use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};

use anyhow::{Context, Result};
use memmap2::Mmap;
use wordnet_types::{
    Frame, Gloss, IndexEntry, Lemma, Pointer, Pos, Synset, SynsetId, SynsetType, decode_st,
};

/// Strategy for bringing the WordNet database files into memory.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LoadMode {
    /// Memory-map each file (zero-copy; pages are faulted in on demand).
    Mmap,
    /// Read each file fully into an owned `Vec<u8>`.
    Owned,
}

enum Buffer {
    Mmap(Mmap),
    Owned(Vec<u8>),
}

impl Buffer {
    fn as_slice(&self) -> &[u8] {
        match self {
            Buffer::Mmap(m) => m.as_ref(),
            Buffer::Owned(v) => v.as_slice(),
        }
    }
}

/// Identifies which backing buffer a `TextRef` points into.
#[derive(Clone, Copy, Debug)]
enum FileKind {
    DataNoun,
    DataVerb,
    DataAdj,
    DataAdv,
    IndexNoun,
    IndexVerb,
    IndexAdj,
    IndexAdv,
    Frames,
    Cntlist,
}

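// Design note: `TextRef` is this module's zero-copy string handle. Lemmas,
// pointer symbols, and gloss fragments are stored as (file, byte range) spans
// into the loaded buffers and re-sliced via `DictFiles::text` when a borrowed
// `Synset`/`IndexEntry` view is built, so no parsed text is duplicated.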
#[derive(Clone, Copy)]
struct TextRef {
    file: FileKind,
    start: usize,
    len: usize,
}

struct DictFiles {
    data_noun: Buffer,
    data_verb: Buffer,
    data_adj: Buffer,
    data_adv: Buffer,
    index_noun: Buffer,
    index_verb: Buffer,
    index_adj: Buffer,
    index_adv: Buffer,
    frames: Option<Buffer>,
    cntlist: Option<Buffer>,
}

impl DictFiles {
    fn load(dict_dir: &Path, mode: LoadMode) -> Result<Self> {
        let data_noun = load_file(dict_dir.join("data.noun"), mode)?;
        let data_verb = load_file(dict_dir.join("data.verb"), mode)?;
        let data_adj = load_file(dict_dir.join("data.adj"), mode)?;
        let data_adv = load_file(dict_dir.join("data.adv"), mode)?;
        let index_noun = load_file(dict_dir.join("index.noun"), mode)?;
        let index_verb = load_file(dict_dir.join("index.verb"), mode)?;
        let index_adj = load_file(dict_dir.join("index.adj"), mode)?;
        let index_adv = load_file(dict_dir.join("index.adv"), mode)?;
        let frames = load_optional_file(dict_dir.join("frames.vrb"), mode)?;
        let cntlist = load_optional_file(dict_dir.join("cntlist.rev"), mode)?;

        Ok(Self {
            data_noun,
            data_verb,
            data_adj,
            data_adv,
            index_noun,
            index_verb,
            index_adj,
            index_adv,
            frames,
            cntlist,
        })
    }

    fn bytes(&self, file: FileKind) -> &[u8] {
        match file {
            FileKind::DataNoun => self.data_noun.as_slice(),
            FileKind::DataVerb => self.data_verb.as_slice(),
            FileKind::DataAdj => self.data_adj.as_slice(),
            FileKind::DataAdv => self.data_adv.as_slice(),
            FileKind::IndexNoun => self.index_noun.as_slice(),
            FileKind::IndexVerb => self.index_verb.as_slice(),
            FileKind::IndexAdj => self.index_adj.as_slice(),
            FileKind::IndexAdv => self.index_adv.as_slice(),
            FileKind::Frames => self.frames.as_ref().map(Buffer::as_slice).unwrap_or(&[]),
            FileKind::Cntlist => self.cntlist.as_ref().map(Buffer::as_slice).unwrap_or(&[]),
        }
    }

    fn text(&self, r: TextRef) -> &str {
        let bytes = self.bytes(r.file);
        let slice = &bytes[r.start..r.start + r.len];
        // Every stored `TextRef` was carved out of text already validated as
        // UTF-8 during parsing, so this cannot fail on well-formed data.
        std::str::from_utf8(slice).expect("wordnet text is valid utf8")
    }
}

struct LemmaData {
    text: TextRef,
    lex_id: u8,
}

struct PointerData {
    symbol: TextRef,
    target: SynsetId,
    src_word: Option<u16>,
    dst_word: Option<u16>,
}

struct GlossData {
    raw: TextRef,
    definition: TextRef,
    examples: Vec<TextRef>,
}

struct SynsetData {
    id: SynsetId,
    lex_filenum: u8,
    synset_type: SynsetType,
    words: Vec<LemmaData>,
    pointers: Vec<PointerData>,
    frames: Vec<Frame>,
    gloss: GlossData,
}

struct IndexEntryData {
    lemma: TextRef,
    synset_cnt: u32,
    p_cnt: u32,
    ptr_symbols: Vec<TextRef>,
    sense_cnt: u32,
    tagsense_cnt: u32,
    synset_offsets: Vec<u32>,
}

/// An in-memory view of a WordNet `dict` directory.
pub struct WordNet {
    files: DictFiles,
    index: HashMap<(Pos, String), IndexEntryData>,
    synsets: HashMap<SynsetId, SynsetData>,
    lemma_to_synsets: HashMap<(Pos, String), Vec<SynsetId>>,
    verb_frames_text: HashMap<u16, TextRef>,
    sense_counts: HashMap<(String, Pos, u32), u32>,
}

impl WordNet {
    /// Loads a WordNet database from `dict_dir`, memory-mapping the files.
    pub fn load(dict_dir: impl AsRef<Path>) -> Result<Self> {
        Self::load_with_mode(dict_dir, LoadMode::Mmap)
    }

    /// Loads a WordNet database from `dict_dir` with an explicit [`LoadMode`].
    pub fn load_with_mode(dict_dir: impl AsRef<Path>, mode: LoadMode) -> Result<Self> {
        let dir = dict_dir.as_ref();
        // Check all required files up front so the error names the missing
        // file instead of failing midway through loading.
        let required = [
            "data.noun",
            "data.verb",
            "data.adj",
            "data.adv",
            "index.noun",
            "index.verb",
            "index.adj",
            "index.adv",
        ];
        for name in &required {
            let path = dir.join(name);
            if !path.exists() {
                anyhow::bail!("missing required WordNet file: {}", path.display());
            }
        }

        let files = DictFiles::load(dir, mode)?;

        let mut index = HashMap::new();
        let mut lemma_to_synsets = HashMap::new();
        parse_index(
            files.bytes(FileKind::IndexNoun),
            FileKind::IndexNoun,
            Pos::Noun,
            &mut index,
            &mut lemma_to_synsets,
        )?;
        parse_index(
            files.bytes(FileKind::IndexVerb),
            FileKind::IndexVerb,
            Pos::Verb,
            &mut index,
            &mut lemma_to_synsets,
        )?;
        parse_index(
            files.bytes(FileKind::IndexAdj),
            FileKind::IndexAdj,
            Pos::Adj,
            &mut index,
            &mut lemma_to_synsets,
        )?;
        parse_index(
            files.bytes(FileKind::IndexAdv),
            FileKind::IndexAdv,
            Pos::Adv,
            &mut index,
            &mut lemma_to_synsets,
        )?;

        let mut synsets = HashMap::new();
        parse_data(
            files.bytes(FileKind::DataNoun),
            FileKind::DataNoun,
            Pos::Noun,
            &mut synsets,
        )?;
        parse_data(
            files.bytes(FileKind::DataVerb),
            FileKind::DataVerb,
            Pos::Verb,
            &mut synsets,
        )?;
        parse_data(
            files.bytes(FileKind::DataAdj),
            FileKind::DataAdj,
            Pos::Adj,
            &mut synsets,
        )?;
        parse_data(
            files.bytes(FileKind::DataAdv),
            FileKind::DataAdv,
            Pos::Adv,
            &mut synsets,
        )?;

        let verb_frames_text = parse_frames_vrb(files.bytes(FileKind::Frames));
        let sense_counts = parse_cntlist(files.bytes(FileKind::Cntlist));

        Ok(Self {
            files,
            index,
            synsets,
            lemma_to_synsets,
            verb_frames_text,
            sense_counts,
        })
    }

    /// Returns true if `lemma` occurs with the given part of speech.
    pub fn lemma_exists(&self, pos: Pos, lemma: &str) -> bool {
        let key = (pos, normalize_lemma(lemma));
        self.lemma_to_synsets.contains_key(&key)
    }

    /// Looks up the index entry for `lemma`, borrowing text from the loaded files.
    pub fn index_entry(&self, pos: Pos, lemma: &str) -> Option<IndexEntry<'_>> {
        let key = (pos, normalize_lemma(lemma));
        self.index.get(&key).map(|entry| IndexEntry {
            lemma: self.files.text(entry.lemma),
            pos,
            synset_cnt: entry.synset_cnt,
            p_cnt: entry.p_cnt,
            ptr_symbols: entry
                .ptr_symbols
                .iter()
                .map(|r| self.files.text(*r))
                .collect(),
            sense_cnt: entry.sense_cnt,
            tagsense_cnt: entry.tagsense_cnt,
            synset_offsets: entry.synset_offsets.as_slice(),
        })
    }

    /// Returns the synsets containing `lemma`, in index (sense) order.
    pub fn synsets_for_lemma(&self, pos: Pos, lemma: &str) -> &[SynsetId] {
        static EMPTY: [SynsetId; 0] = [];
        let key = (pos, normalize_lemma(lemma));
        self.lemma_to_synsets
            .get(&key)
            .map(|v| v.as_slice())
            .unwrap_or(&EMPTY)
    }

    /// Resolves a synset id to a borrowed view, if it exists.
    pub fn get_synset(&self, id: SynsetId) -> Option<Synset<'_>> {
        self.synsets.get(&id).map(|syn| self.make_synset_view(syn))
    }

    /// Iterates over every synset in the database (in no particular order).
    pub fn iter_synsets(&self) -> impl Iterator<Item = Synset<'_>> + '_ {
        self.synsets.values().map(|s| self.make_synset_view(s))
    }

    /// Number of (pos, lemma) index entries.
    pub fn index_count(&self) -> usize {
        self.index.len()
    }

    /// Number of distinct (pos, lemma) keys.
    pub fn lemma_count(&self) -> usize {
        self.lemma_to_synsets.len()
    }

    /// Number of synsets across all parts of speech.
    pub fn synset_count(&self) -> usize {
        self.synsets.len()
    }

    /// Number of verb frame templates loaded from `frames.vrb`.
    pub fn verb_frame_templates_count(&self) -> usize {
        self.verb_frames_text.len()
    }

    /// Number of sense-count entries loaded from `cntlist.rev`.
    pub fn sense_count_entries(&self) -> usize {
        self.sense_counts.len()
    }

    fn make_synset_view<'a>(&'a self, data: &'a SynsetData) -> Synset<'a> {
        let words = data
            .words
            .iter()
            .map(|w| Lemma {
                text: self.files.text(w.text),
                lex_id: w.lex_id,
            })
            .collect();
        let pointers = data
            .pointers
            .iter()
            .map(|p| Pointer {
                symbol: self.files.text(p.symbol),
                target: p.target,
                src_word: p.src_word,
                dst_word: p.dst_word,
            })
            .collect();
        let gloss = Gloss {
            raw: self.files.text(data.gloss.raw),
            definition: self.files.text(data.gloss.definition),
            examples: data
                .gloss
                .examples
                .iter()
                .map(|r| self.files.text(*r))
                .collect(),
        };

        Synset {
            id: data.id,
            lex_filenum: data.lex_filenum,
            synset_type: data.synset_type,
            words,
            pointers,
            frames: data.frames.as_slice(),
            gloss,
        }
    }
}
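
// Example usage (a sketch; the dict path is an assumption — point it at any
// WordNet 3.x `dict/` directory):
//
//     let wn = WordNet::load("/usr/share/wordnet/dict")?;
//     for &id in wn.synsets_for_lemma(Pos::Noun, "dog") {
//         if let Some(syn) = wn.get_synset(id) {
//             println!("{:08} | {}", id.offset, syn.gloss.definition);
//         }
//     }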

fn load_file(path: PathBuf, mode: LoadMode) -> Result<Buffer> {
    match mode {
        LoadMode::Mmap => {
            let file = File::open(&path).with_context(|| format!("open {}", path.display()))?;
            // SAFETY: the map is only unsound if the underlying file is
            // modified while mapped; the dict files are treated as read-only
            // for the lifetime of the process.
            unsafe { Mmap::map(&file) }
                .map(Buffer::Mmap)
                .with_context(|| format!("mmap {}", path.display()))
        }
        LoadMode::Owned => {
            let mut file = File::open(&path).with_context(|| format!("open {}", path.display()))?;
            let mut buf = Vec::new();
            file.read_to_end(&mut buf)
                .with_context(|| format!("read {}", path.display()))?;
            Ok(Buffer::Owned(buf))
        }
    }
}

fn load_optional_file(path: PathBuf, mode: LoadMode) -> Result<Option<Buffer>> {
    if !path.exists() {
        return Ok(None);
    }
    load_file(path, mode).map(Some)
}

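// Index file lines (see wndb(5)) have the shape:
//
//     lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt
//         synset_offset [synset_offset...]
//
// `pos` (tokens[1]) is redundant with the file being parsed, so it is skipped.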
fn parse_index(
    bytes: &[u8],
    file: FileKind,
    pos: Pos,
    index: &mut HashMap<(Pos, String), IndexEntryData>,
    lemma_to_synsets: &mut HashMap<(Pos, String), Vec<SynsetId>>,
) -> Result<()> {
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        // Skip blank lines and the license header (which begins with spaces).
        if line.is_empty() || matches!(line.first(), Some(b' ' | b'\t')) {
            continue;
        }
        let line_str = std::str::from_utf8(line)?;
        let tokens: Vec<&str> = line_str.split_ascii_whitespace().collect();
        if tokens.len() < 6 {
            anyhow::bail!(
                "{:?}:{} malformed index line (too few tokens)",
                file,
                lineno + 1
            );
        }

        let lemma_token = tokens[0];
        let lemma_ref = text_ref_str(file, bytes, lemma_token);
        let lemma_key = normalize_lemma(lemma_token);

        let synset_cnt: u32 = tokens[2]
            .parse()
            .with_context(|| format!("index {:?}:{} synset_cnt", file, lineno + 1))?;
        let p_cnt: u32 = tokens[3]
            .parse()
            .with_context(|| format!("index {:?}:{} p_cnt", file, lineno + 1))?;

        let expected_ptrs = p_cnt as usize;
        let mut idx = 4;
        if tokens.len() < idx + expected_ptrs {
            anyhow::bail!("{:?}:{} pointer count mismatch", file, lineno + 1);
        }
        let ptr_symbols = tokens[idx..idx + expected_ptrs]
            .iter()
            .map(|sym| text_ref_str(file, bytes, sym))
            .collect::<Vec<_>>();
        idx += expected_ptrs;
        if tokens.len() < idx + 2 {
            anyhow::bail!("{:?}:{} missing sense counts", file, lineno + 1);
        }
        let sense_cnt: u32 = tokens[idx]
            .parse()
            .with_context(|| format!("index {:?}:{} sense_cnt", file, lineno + 1))?;
        idx += 1;
        let tagsense_cnt: u32 = tokens[idx]
            .parse()
            .with_context(|| format!("index {:?}:{} tagsense_cnt", file, lineno + 1))?;
        idx += 1;

        let offsets: Vec<u32> = tokens[idx..]
            .iter()
            .map(|t| {
                t.parse::<u32>()
                    .with_context(|| format!("index {:?}:{} synset_offsets", file, lineno + 1))
            })
            .collect::<Result<_>>()?;
        if offsets.len() != synset_cnt as usize {
            anyhow::bail!(
                "{:?}:{} synset_cnt mismatch (expected {}, got {})",
                file,
                lineno + 1,
                synset_cnt,
                offsets.len()
            );
        }

        index.insert(
            (pos, lemma_key.clone()),
            IndexEntryData {
                lemma: lemma_ref,
                synset_cnt,
                p_cnt,
                ptr_symbols,
                sense_cnt,
                tagsense_cnt,
                synset_offsets: offsets.clone(),
            },
        );
        lemma_to_synsets.insert(
            (pos, lemma_key),
            offsets
                .into_iter()
                .map(|offset| SynsetId { pos, offset })
                .collect(),
        );
    }

    Ok(())
}

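// Data file lines (see wndb(5)) have the shape:
//
//     synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
//         p_cnt [ptr...] [frames...] | gloss
//
// where `w_cnt` and each `lex_id` are hexadecimal, each pointer is the four
// tokens `symbol offset pos source/target`, and the verb-only frame list is
// `f_cnt` followed by `+ f_num w_num` triples.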
fn parse_data(
    bytes: &[u8],
    file: FileKind,
    pos: Pos,
    synsets: &mut HashMap<SynsetId, SynsetData>,
) -> Result<()> {
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() || matches!(line.first(), Some(b' ' | b'\t')) {
            continue;
        }
        let line_str = std::str::from_utf8(line)?;
        // The empty-gloss fallback must be a subslice of `line_str`, not a
        // `""` literal: `parse_gloss` subtracts raw pointers to compute byte
        // offsets, which is only meaningful for slices of this file's buffer.
        let (left, gloss_part) = match line_str.split_once('|') {
            Some((l, r)) => (l.trim(), r.trim()),
            None => (line_str.trim(), &line_str[line_str.len()..]),
        };

        let tokens: Vec<&str> = left.split_ascii_whitespace().collect();
        if tokens.len() < 4 {
            anyhow::bail!("{:?}:{} malformed data line", file, lineno + 1);
        }

        let offset: u32 = tokens[0]
            .parse()
            .with_context(|| format!("{:?}:{} offset", file, lineno + 1))?;
        let lex_filenum: u8 = tokens[1]
            .parse()
            .with_context(|| format!("{:?}:{} lex_filenum", file, lineno + 1))?;
        let ss_type_char = tokens[2]
            .chars()
            .next()
            .ok_or_else(|| anyhow::anyhow!("{:?}:{} missing ss_type", file, lineno + 1))?;
        let synset_type = SynsetType::from_char(ss_type_char).ok_or_else(|| {
            anyhow::anyhow!("{:?}:{} invalid ss_type {}", file, lineno + 1, ss_type_char)
        })?;
        let w_cnt: usize = usize::from_str_radix(tokens[3], 16)
            .with_context(|| format!("{:?}:{} w_cnt", file, lineno + 1))?;

        let mut idx = 4;
        if tokens.len() < idx + (w_cnt * 2) {
            anyhow::bail!("{:?}:{} not enough word/lex_id pairs", file, lineno + 1);
        }
        let mut words = Vec::with_capacity(w_cnt);
        for _ in 0..w_cnt {
            let text_token = tokens[idx];
            let lex_id_token = tokens[idx + 1];
            let lex_id: u8 = u8::from_str_radix(lex_id_token, 16)
                .with_context(|| format!("{:?}:{} lex_id", file, lineno + 1))?;
            words.push(LemmaData {
                text: text_ref_str(file, bytes, text_token),
                lex_id,
            });
            idx += 2;
        }

        if tokens.len() <= idx {
            anyhow::bail!("{:?}:{} missing pointer count", file, lineno + 1);
        }
        let p_cnt: usize = tokens[idx]
            .parse()
            .with_context(|| format!("{:?}:{} p_cnt", file, lineno + 1))?;
        idx += 1;

        let mut pointers = Vec::with_capacity(p_cnt);
        for _ in 0..p_cnt {
            if tokens.len() < idx + 4 {
                anyhow::bail!("{:?}:{} incomplete pointer block", file, lineno + 1);
            }
            let symbol = tokens[idx];
            let target_offset: u32 = tokens[idx + 1]
                .parse()
                .with_context(|| format!("{:?}:{} pointer target offset", file, lineno + 1))?;
            let target_pos = tokens[idx + 2]
                .chars()
                .next()
                .and_then(Pos::from_char)
                .ok_or_else(|| anyhow::anyhow!("{:?}:{} pointer target pos", file, lineno + 1))?;
            let (src_word, dst_word) = decode_st(tokens[idx + 3]);
            pointers.push(PointerData {
                symbol: text_ref_str(file, bytes, symbol),
                target: SynsetId {
                    pos: target_pos,
                    offset: target_offset,
                },
                src_word,
                dst_word,
            });
            idx += 4;
        }

        let mut frames = Vec::new();
        if matches!(pos, Pos::Verb) {
            let f_cnt: usize = if tokens.len() <= idx {
                0
            } else {
                let v: usize = tokens[idx]
                    .parse()
                    .with_context(|| format!("{:?}:{} f_cnt", file, lineno + 1))?;
                idx += 1;
                v
            };
            for _ in 0..f_cnt {
                if tokens.len() < idx + 3 {
                    anyhow::bail!("{:?}:{} incomplete frame entry", file, lineno + 1);
                }
                if tokens[idx] != "+" {
                    anyhow::bail!("{:?}:{} expected '+' before frame entry", file, lineno + 1);
                }
                let frame_number: u16 = tokens[idx + 1]
                    .parse()
                    .with_context(|| format!("{:?}:{} frame_number", file, lineno + 1))?;
                let word_number = parse_word_number(tokens[idx + 2]);
                frames.push(Frame {
                    frame_number,
                    word_number,
                });
                idx += 3;
            }
        }

        let gloss = parse_gloss(file, bytes, gloss_part)?;
        let id = SynsetId { pos, offset };
        synsets.insert(
            id,
            SynsetData {
                id,
                lex_filenum,
                synset_type,
                words,
                pointers,
                frames,
                gloss,
            },
        );
    }

    Ok(())
}

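// A gloss is a definition optionally followed by quoted example sentences,
// schematically: `definition; "first example"; "second example"`. The parser
// below takes everything before the first unquoted ';' as the definition and
// each non-empty double-quoted span as an example.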
fn parse_gloss(file: FileKind, root: &[u8], gloss: &str) -> Result<GlossData> {
    let trimmed = gloss.trim();
    let gloss_raw = text_ref_str(file, root, trimmed);

    let mut examples = Vec::new();
    let mut in_quote = false;
    let mut quote_start: Option<usize> = None;
    let mut def_end = trimmed.len();
    for (idx, ch) in trimmed.char_indices() {
        match ch {
            '"' => {
                if in_quote {
                    // Closing quote: record the span between the quotes,
                    // skipping empty examples.
                    if let Some(start) = quote_start.take()
                        && idx > start + 1
                    {
                        let start_bytes =
                            trimmed.as_ptr() as usize + start + 1 - root.as_ptr() as usize;
                        examples.push(TextRef {
                            file,
                            start: start_bytes,
                            len: idx - start - 1,
                        });
                    }
                } else {
                    quote_start = Some(idx);
                }
                in_quote = !in_quote;
            }
            ';' if !in_quote && def_end == trimmed.len() => {
                def_end = idx;
            }
            _ => {}
        }
    }

    let definition_slice = trimmed[..def_end].trim();
    let def_start = definition_slice.as_ptr() as usize - trimmed.as_ptr() as usize;

    let definition = TextRef {
        file,
        start: trimmed.as_ptr() as usize + def_start - root.as_ptr() as usize,
        len: definition_slice.len(),
    };

    Ok(GlossData {
        raw: gloss_raw,
        definition,
        examples,
    })
}

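// Parses `frames.vrb`, which this loader expects to hold one `f_num f_text`
// pair per line (e.g. `8 Somebody ----s something`); lines whose first token
// is not a number are reported and skipped.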
fn parse_frames_vrb(bytes: &[u8]) -> HashMap<u16, TextRef> {
    let mut frames = HashMap::new();
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() {
            continue;
        }
        let line_str = match std::str::from_utf8(line) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let mut parts = line_str.splitn(2, ' ');
        let num = parts.next().and_then(|t| t.parse::<u16>().ok());
        // Fall back to an empty subslice of `line_str` (never a `""` literal)
        // so the offset arithmetic below stays inside this file's buffer.
        let text = parts
            .next()
            .map(str::trim)
            .unwrap_or(&line_str[line_str.len()..]);
        if let Some(n) = num {
            let start = text.as_ptr() as usize - bytes.as_ptr() as usize;
            frames.insert(
                n,
                TextRef {
                    file: FileKind::Frames,
                    start,
                    len: text.len(),
                },
            );
        } else {
            eprintln!("frames.vrb:{} invalid frame number", lineno + 1);
        }
    }
    frames
}

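// Parses sense counts. This loader expects a simplified
// `count lemma pos_char [sense_number]` layout per line (note: the stock
// WordNet `cntlist.rev` format is `sense_key sense_number tag_cnt`, so the
// file may need preprocessing into this shape).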
fn parse_cntlist(bytes: &[u8]) -> HashMap<(String, Pos, u32), u32> {
    let mut counts = HashMap::new();
    for raw_line in bytes.split(|b| *b == b'\n') {
        let line = strip_cr(raw_line);
        if line.is_empty() {
            continue;
        }
        let line_str = match std::str::from_utf8(line) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let tokens: Vec<&str> = line_str.split_ascii_whitespace().collect();
        if tokens.len() < 3 {
            continue;
        }
        let count: u32 = match tokens[0].parse() {
            Ok(c) => c,
            Err(_) => continue,
        };
        let lemma = normalize_lemma(tokens[1]);
        let pos = tokens[2]
            .chars()
            .next()
            .and_then(Pos::from_char)
            .unwrap_or(Pos::Noun);
        let sense_number: u32 = tokens.get(3).and_then(|t| t.parse().ok()).unwrap_or(1);
        counts.insert((lemma, pos, sense_number), count);
    }
    counts
}

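// Computes a `TextRef` from a token by pointer arithmetic. Invariant: `token`
// must be a subslice of `root` (every caller passes tokens split out of the
// same buffer); otherwise the computed offset is meaningless.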
fn text_ref_str(file: FileKind, root: &[u8], token: &str) -> TextRef {
    let start = token.as_ptr() as usize - root.as_ptr() as usize;
    TextRef {
        file,
        start,
        len: token.len(),
    }
}

fn strip_cr(line: &[u8]) -> &[u8] {
    if line.ends_with(b"\r") {
        &line[..line.len() - 1]
    } else {
        line
    }
}

/// Parses a frame `w_num` (two-digit hex in the data files); `00` means the
/// frame applies to every word in the synset, which is modeled as `None`.
fn parse_word_number(token: &str) -> Option<u16> {
    u16::from_str_radix(token, 16)
        .or_else(|_| token.parse::<u16>())
        .ok()
        .and_then(|v| if v == 0 { None } else { Some(v) })
}

/// Normalizes a lemma for lookup: trimmed, ASCII-lowercased, with spaces
/// replaced by underscores (the dict files store multiword lemmas with `_`).
fn normalize_lemma(text: &str) -> String {
    let mut s = text.trim().to_string();
    s.make_ascii_lowercase();
    s.replace(' ', "_")
}