use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};

use anyhow::{Context, Result};
use memmap2::Mmap;
use wordnet_types::{
    Frame, Gloss, IndexEntry, Lemma, Pointer, Pos, Synset, SynsetId, SynsetType, decode_st,
};

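/// How the dictionary files are brought into memory: `Mmap` maps them
/// with `memmap2`, while `Owned` reads each file into a `Vec<u8>`.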
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LoadMode {
    Mmap,
    Owned,
}

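/// A loaded file: either a live memory map or an owned byte buffer.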
enum Buffer {
    Mmap(Mmap),
    Owned(Vec<u8>),
}

impl Buffer {
    fn as_slice(&self) -> &[u8] {
        match self {
            Buffer::Mmap(m) => m.as_ref(),
            Buffer::Owned(v) => v.as_slice(),
        }
    }
}

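/// Identifies which dictionary file a `TextRef` points into.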
#[derive(Clone, Copy, Debug)]
enum FileKind {
    DataNoun,
    DataVerb,
    DataAdj,
    DataAdv,
    IndexNoun,
    IndexVerb,
    IndexAdj,
    IndexAdv,
    Frames,
    Cntlist,
}

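/// A byte range inside one of the loaded files, resolved to `&str` on
/// demand by `DictFiles::text`, so no string data is copied.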
#[derive(Clone, Copy)]
struct TextRef {
    file: FileKind,
    start: usize,
    len: usize,
}

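/// The set of dictionary files backing one `WordNet` instance.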
struct DictFiles {
    data_noun: Buffer,
    data_verb: Buffer,
    data_adj: Buffer,
    data_adv: Buffer,
    index_noun: Buffer,
    index_verb: Buffer,
    index_adj: Buffer,
    index_adv: Buffer,
    frames: Option<Buffer>,
    cntlist: Option<Buffer>,
}

impl DictFiles {
    fn load(dict_dir: &Path, mode: LoadMode) -> Result<Self> {
        let data_noun = load_file(dict_dir.join("data.noun"), mode)?;
        let data_verb = load_file(dict_dir.join("data.verb"), mode)?;
        let data_adj = load_file(dict_dir.join("data.adj"), mode)?;
        let data_adv = load_file(dict_dir.join("data.adv"), mode)?;
        let index_noun = load_file(dict_dir.join("index.noun"), mode)?;
        let index_verb = load_file(dict_dir.join("index.verb"), mode)?;
        let index_adj = load_file(dict_dir.join("index.adj"), mode)?;
        let index_adv = load_file(dict_dir.join("index.adv"), mode)?;
        let frames = load_optional_file(dict_dir.join("frames.vrb"), mode)?;
        let cntlist = load_optional_file(dict_dir.join("cntlist.rev"), mode)?;

        Ok(Self {
            data_noun,
            data_verb,
            data_adj,
            data_adv,
            index_noun,
            index_verb,
            index_adj,
            index_adv,
            frames,
            cntlist,
        })
    }

    fn bytes(&self, file: FileKind) -> &[u8] {
        match file {
            FileKind::DataNoun => self.data_noun.as_slice(),
            FileKind::DataVerb => self.data_verb.as_slice(),
            FileKind::DataAdj => self.data_adj.as_slice(),
            FileKind::DataAdv => self.data_adv.as_slice(),
            FileKind::IndexNoun => self.index_noun.as_slice(),
            FileKind::IndexVerb => self.index_verb.as_slice(),
            FileKind::IndexAdj => self.index_adj.as_slice(),
            FileKind::IndexAdv => self.index_adv.as_slice(),
            FileKind::Frames => self.frames.as_ref().map(Buffer::as_slice).unwrap_or(&[]),
            FileKind::Cntlist => self.cntlist.as_ref().map(Buffer::as_slice).unwrap_or(&[]),
        }
    }

    fn text(&self, r: TextRef) -> &str {
        let bytes = self.bytes(r.file);
        let slice = &bytes[r.start..r.start + r.len];
        std::str::from_utf8(slice).expect("wordnet text is valid utf8")
    }
}

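// Internal storage types: these mirror the public view types from
// `wordnet_types` but hold `TextRef` spans into the file buffers;
// borrowed views are materialized on demand in `make_synset_view`.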
struct LemmaData {
    text: TextRef,
    lex_id: u8,
}

struct PointerData {
    symbol: TextRef,
    target: SynsetId,
    src_word: Option<u16>,
    dst_word: Option<u16>,
}

struct GlossData {
    raw: TextRef,
    definition: TextRef,
    examples: Vec<TextRef>,
}

struct SynsetData {
    id: SynsetId,
    lex_filenum: u8,
    synset_type: SynsetType,
    words: Vec<LemmaData>,
    pointers: Vec<PointerData>,
    frames: Vec<Frame>,
    gloss: GlossData,
}

struct IndexEntryData {
    lemma: TextRef,
    synset_cnt: u32,
    p_cnt: u32,
    ptr_symbols: Vec<TextRef>,
    sense_cnt: u32,
    tagsense_cnt: u32,
    synset_offsets: Vec<u32>,
}

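/// An in-memory WordNet database. Lookups return zero-copy views that
/// borrow from the loaded dictionary files.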
pub struct WordNet {
    files: DictFiles,
    index: HashMap<(Pos, String), IndexEntryData>,
    synsets: HashMap<SynsetId, SynsetData>,
    lemma_to_synsets: HashMap<(Pos, String), Vec<SynsetId>>,
    verb_frames_text: HashMap<u16, TextRef>,
    sense_counts: HashMap<(String, Pos, u32), u32>,
}

impl WordNet {
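    /// Loads a WordNet `dict` directory with the default memory-mapped
    /// mode. See [`WordNet::load_with_mode`] to pick the mode explicitly.
    ///
    /// A minimal usage sketch; the dictionary path is a placeholder and
    /// `Pos` comes from the `wordnet_types` crate:
    ///
    /// ```ignore
    /// let wn = WordNet::load("/usr/share/wordnet/dict")?;
    /// for id in wn.synsets_for_lemma(Pos::Noun, "dog") {
    ///     let synset = wn.get_synset(*id).expect("index offsets resolve");
    ///     println!("{}", synset.gloss.definition);
    /// }
    /// ```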
    pub fn load(dict_dir: impl AsRef<Path>) -> Result<Self> {
        Self::load_with_mode(dict_dir, LoadMode::Mmap)
    }

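    /// Loads a WordNet `dict` directory, choosing between memory-mapped
    /// and owned buffers. Fails early if any required `data.*`/`index.*`
    /// file is missing; `frames.vrb` and `cntlist.rev` are optional.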
    pub fn load_with_mode(dict_dir: impl AsRef<Path>, mode: LoadMode) -> Result<Self> {
        let dir = dict_dir.as_ref();
        let required = [
            "data.noun",
            "data.verb",
            "data.adj",
            "data.adv",
            "index.noun",
            "index.verb",
            "index.adj",
            "index.adv",
        ];
        for name in &required {
            let path = dir.join(name);
            if !path.exists() {
                anyhow::bail!("missing required WordNet file: {}", path.display());
            }
        }

        let files = DictFiles::load(dir, mode)?;

        let mut index = HashMap::new();
        let mut lemma_to_synsets = HashMap::new();
        parse_index(
            files.bytes(FileKind::IndexNoun),
            FileKind::IndexNoun,
            Pos::Noun,
            &mut index,
            &mut lemma_to_synsets,
        )?;
        parse_index(
            files.bytes(FileKind::IndexVerb),
            FileKind::IndexVerb,
            Pos::Verb,
            &mut index,
            &mut lemma_to_synsets,
        )?;
        parse_index(
            files.bytes(FileKind::IndexAdj),
            FileKind::IndexAdj,
            Pos::Adj,
            &mut index,
            &mut lemma_to_synsets,
        )?;
        parse_index(
            files.bytes(FileKind::IndexAdv),
            FileKind::IndexAdv,
            Pos::Adv,
            &mut index,
            &mut lemma_to_synsets,
        )?;

        let mut synsets = HashMap::new();
        parse_data(
            files.bytes(FileKind::DataNoun),
            FileKind::DataNoun,
            Pos::Noun,
            &mut synsets,
        )?;
        parse_data(
            files.bytes(FileKind::DataVerb),
            FileKind::DataVerb,
            Pos::Verb,
            &mut synsets,
        )?;
        parse_data(
            files.bytes(FileKind::DataAdj),
            FileKind::DataAdj,
            Pos::Adj,
            &mut synsets,
        )?;
        parse_data(
            files.bytes(FileKind::DataAdv),
            FileKind::DataAdv,
            Pos::Adv,
            &mut synsets,
        )?;

        let verb_frames_text = parse_frames_vrb(files.bytes(FileKind::Frames));
        let sense_counts = parse_cntlist(files.bytes(FileKind::Cntlist));

        Ok(Self {
            files,
            index,
            synsets,
            lemma_to_synsets,
            verb_frames_text,
            sense_counts,
        })
    }

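    /// Returns true if the normalized `lemma` has any synsets for `pos`.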
    pub fn lemma_exists(&self, pos: Pos, lemma: &str) -> bool {
        let key = (pos, normalize_lemma(lemma));
        self.lemma_to_synsets.contains_key(&key)
    }

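    /// Returns the parsed index entry for `lemma` under `pos`, if any.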
    pub fn index_entry(&self, pos: Pos, lemma: &str) -> Option<IndexEntry<'_>> {
        let key = (pos, normalize_lemma(lemma));
        self.index.get(&key).map(|entry| IndexEntry {
            lemma: self.files.text(entry.lemma),
            pos,
            synset_cnt: entry.synset_cnt,
            p_cnt: entry.p_cnt,
            ptr_symbols: entry
                .ptr_symbols
                .iter()
                .map(|r| self.files.text(*r))
                .collect(),
            sense_cnt: entry.sense_cnt,
            tagsense_cnt: entry.tagsense_cnt,
            synset_offsets: entry.synset_offsets.as_slice(),
        })
    }

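    /// All synsets containing `lemma` with part of speech `pos`, in the
    /// sense order given by the index file.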
    pub fn synsets_for_lemma(&self, pos: Pos, lemma: &str) -> &[SynsetId] {
        static EMPTY: [SynsetId; 0] = [];
        let key = (pos, normalize_lemma(lemma));
        self.lemma_to_synsets
            .get(&key)
            .map(|v| v.as_slice())
            .unwrap_or(&EMPTY)
    }

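    /// Resolves a synset id to a borrowed view, if the synset exists.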
    pub fn get_synset(&self, id: SynsetId) -> Option<Synset<'_>> {
        self.synsets.get(&id).map(|syn| self.make_synset_view(syn))
    }

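    /// Iterates over every synset in the database, in arbitrary order.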
    pub fn iter_synsets(&self) -> impl Iterator<Item = Synset<'_>> + '_ {
        self.synsets.values().map(|s| self.make_synset_view(s))
    }

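    /// Number of `(pos, lemma)` entries parsed from the index files.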
    pub fn index_count(&self) -> usize {
        self.index.len()
    }

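    /// Number of distinct `(pos, lemma)` keys with at least one synset.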
    pub fn lemma_count(&self) -> usize {
        self.lemma_to_synsets.len()
    }

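    /// Total number of parsed synsets across all parts of speech.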
    pub fn synset_count(&self) -> usize {
        self.synsets.len()
    }

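    /// Number of verb frame templates loaded from `frames.vrb` (0 if the
    /// file was absent).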
    pub fn verb_frame_templates_count(&self) -> usize {
        self.verb_frames_text.len()
    }

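    /// Number of sense-count records loaded from `cntlist.rev` (0 if the
    /// file was absent).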
    pub fn sense_count_entries(&self) -> usize {
        self.sense_counts.len()
    }

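    /// Corpus tag count for one sense of `lemma`, if known. The sense
    /// number is the 1-based position of `synset_offset` in the index
    /// entry's offset list, which is then looked up in the cntlist data.
    ///
    /// A sketch of the intended call pattern (assumes a loaded `WordNet`
    /// named `wn`; `offset` is the `SynsetId` field this file itself
    /// constructs):
    ///
    /// ```ignore
    /// if let Some(id) = wn.synsets_for_lemma(Pos::Verb, "run").first() {
    ///     let tagged = wn.sense_count(Pos::Verb, "run", id.offset);
    ///     println!("sense 1 of 'run' tagged {:?} times", tagged);
    /// }
    /// ```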
    pub fn sense_count(&self, pos: Pos, lemma: &str, synset_offset: u32) -> Option<u32> {
        let normalized = normalize_lemma(lemma);
        let entry = self.index.get(&(pos, normalized.clone()))?;
        let sense_number = entry
            .synset_offsets
            .iter()
            .position(|off| *off == synset_offset)?;
        let sense_number = sense_number as u32 + 1;
        self.sense_counts
            .get(&(normalized, pos, sense_number))
            .copied()
    }

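    /// Materializes a borrowed `Synset` view, resolving every `TextRef`
    /// against the loaded file buffers.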
    fn make_synset_view<'a>(&'a self, data: &'a SynsetData) -> Synset<'a> {
        let words = data
            .words
            .iter()
            .map(|w| Lemma {
                text: self.files.text(w.text),
                lex_id: w.lex_id,
            })
            .collect();
        let pointers = data
            .pointers
            .iter()
            .map(|p| Pointer {
                symbol: self.files.text(p.symbol),
                target: p.target,
                src_word: p.src_word,
                dst_word: p.dst_word,
            })
            .collect();
        let gloss = Gloss {
            raw: self.files.text(data.gloss.raw),
            definition: self.files.text(data.gloss.definition),
            examples: data
                .gloss
                .examples
                .iter()
                .map(|r| self.files.text(*r))
                .collect(),
        };

        Synset {
            id: data.id,
            lex_filenum: data.lex_filenum,
            synset_type: data.synset_type,
            words,
            pointers,
            frames: data.frames.as_slice(),
            gloss,
        }
    }
}

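/// Opens and loads a single dictionary file according to `mode`.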
fn load_file(path: PathBuf, mode: LoadMode) -> Result<Buffer> {
    match mode {
        LoadMode::Mmap => {
            let file = File::open(&path).with_context(|| format!("open {}", path.display()))?;
            // SAFETY: the map is only unsound if the file is mutated while
            // mapped; we assume the dictionary files stay untouched for the
            // lifetime of the process.
            unsafe { Mmap::map(&file) }
                .map(Buffer::Mmap)
                .with_context(|| format!("mmap {}", path.display()))
        }
        LoadMode::Owned => {
            let mut file = File::open(&path).with_context(|| format!("open {}", path.display()))?;
            let mut buf = Vec::new();
            file.read_to_end(&mut buf)
                .with_context(|| format!("read {}", path.display()))?;
            Ok(Buffer::Owned(buf))
        }
    }
}

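/// Like `load_file`, but returns `Ok(None)` when the file is absent.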
fn load_optional_file(path: PathBuf, mode: LoadMode) -> Result<Option<Buffer>> {
    if !path.exists() {
        return Ok(None);
    }
    load_file(path, mode).map(Some)
}

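/// Parses an `index.*` file. Each entry line has the shape:
///
/// `lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset...`
///
/// License-header lines start with whitespace and are skipped.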
fn parse_index(
    bytes: &[u8],
    file: FileKind,
    pos: Pos,
    index: &mut HashMap<(Pos, String), IndexEntryData>,
    lemma_to_synsets: &mut HashMap<(Pos, String), Vec<SynsetId>>,
) -> Result<()> {
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() || matches!(line.first(), Some(b' ' | b'\t')) {
            continue;
        }
        let line_str = std::str::from_utf8(line)?;
        let tokens: Vec<&str> = line_str.split_ascii_whitespace().collect();
        if tokens.len() < 6 {
            anyhow::bail!(
                "{:?}:{} malformed index line (too few tokens)",
                file,
                lineno + 1
            );
        }

        let lemma_token = tokens[0];
        let lemma_ref = text_ref_str(file, bytes, lemma_token);
        let lemma_key = normalize_lemma(lemma_token);

        let synset_cnt: u32 = tokens[2]
            .parse()
            .with_context(|| format!("index {:?}:{} synset_cnt", file, lineno + 1))?;
        let p_cnt: u32 = tokens[3]
            .parse()
            .with_context(|| format!("index {:?}:{} p_cnt", file, lineno + 1))?;

        let expected_ptrs = p_cnt as usize;
        let mut idx = 4;
        if tokens.len() < idx + expected_ptrs {
            anyhow::bail!("{:?}:{} pointer count mismatch", file, lineno + 1);
        }
        let ptr_symbols = tokens[idx..idx + expected_ptrs]
            .iter()
            .map(|sym| text_ref_str(file, bytes, sym))
            .collect::<Vec<_>>();
        idx += expected_ptrs;
        if tokens.len() < idx + 2 {
            anyhow::bail!("{:?}:{} missing sense counts", file, lineno + 1);
        }
        let sense_cnt: u32 = tokens[idx]
            .parse()
            .with_context(|| format!("index {:?}:{} sense_cnt", file, lineno + 1))?;
        idx += 1;
        let tagsense_cnt: u32 = tokens[idx]
            .parse()
            .with_context(|| format!("index {:?}:{} tagsense_cnt", file, lineno + 1))?;
        idx += 1;

        let offsets: Vec<u32> = tokens[idx..]
            .iter()
            .map(|t| {
                t.parse::<u32>()
                    .with_context(|| format!("index {:?}:{} synset_offsets", file, lineno + 1))
            })
            .collect::<Result<_>>()?;
        if offsets.len() != synset_cnt as usize {
            anyhow::bail!(
                "{:?}:{} synset_cnt mismatch (expected {}, got {})",
                file,
                lineno + 1,
                synset_cnt,
                offsets.len()
            );
        }

        index.insert(
            (pos, lemma_key.clone()),
            IndexEntryData {
                lemma: lemma_ref,
                synset_cnt,
                p_cnt,
                ptr_symbols,
                sense_cnt,
                tagsense_cnt,
                synset_offsets: offsets.clone(),
            },
        );
        lemma_to_synsets.insert(
            (pos, lemma_key),
            offsets
                .into_iter()
                .map(|offset| SynsetId { pos, offset })
                .collect(),
        );
    }

    Ok(())
}

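/// Parses a `data.*` file. Each synset line has the shape:
///
/// `offset lex_filenum ss_type w_cnt (word lex_id)... p_cnt (ptr)... [f_cnt (+ f_num w_num)...] | gloss`
///
/// where `w_cnt` and `lex_id` are hexadecimal and the frame section only
/// appears in `data.verb`. License-header lines start with whitespace and
/// are skipped.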
fn parse_data(
    bytes: &[u8],
    file: FileKind,
    pos: Pos,
    synsets: &mut HashMap<SynsetId, SynsetData>,
) -> Result<()> {
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() || matches!(line.first(), Some(b' ' | b'\t')) {
            continue;
        }
        let line_str = std::str::from_utf8(line)?;
        let (left, gloss_part) = match line_str.split_once('|') {
            Some((l, r)) => (l.trim(), r.trim()),
            None => (line_str.trim(), ""),
        };

        let tokens: Vec<&str> = left.split_ascii_whitespace().collect();
        if tokens.len() < 4 {
            anyhow::bail!("{:?}:{} malformed data line", file, lineno + 1);
        }

        let offset: u32 = tokens[0]
            .parse()
            .with_context(|| format!("{:?}:{} offset", file, lineno + 1))?;
        let lex_filenum: u8 = tokens[1]
            .parse()
            .with_context(|| format!("{:?}:{} lex_filenum", file, lineno + 1))?;
        let ss_type_char = tokens[2]
            .chars()
            .next()
            .ok_or_else(|| anyhow::anyhow!("{:?}:{} missing ss_type", file, lineno + 1))?;
        let synset_type = SynsetType::from_char(ss_type_char).ok_or_else(|| {
            anyhow::anyhow!("{:?}:{} invalid ss_type {}", file, lineno + 1, ss_type_char)
        })?;
        let w_cnt: usize = usize::from_str_radix(tokens[3], 16)
            .with_context(|| format!("{:?}:{} w_cnt", file, lineno + 1))?;

        let mut idx = 4;
        if tokens.len() < idx + (w_cnt * 2) {
            anyhow::bail!("{:?}:{} not enough word/lex_id pairs", file, lineno + 1);
        }
        let mut words = Vec::with_capacity(w_cnt);
        for _ in 0..w_cnt {
            let text_token = tokens[idx];
            let lex_id_token = tokens[idx + 1];
            let lex_id: u8 = u8::from_str_radix(lex_id_token, 16)
                .with_context(|| format!("{:?}:{} lex_id", file, lineno + 1))?;
            words.push(LemmaData {
                text: text_ref_str(file, bytes, text_token),
                lex_id,
            });
            idx += 2;
        }

        if tokens.len() <= idx {
            anyhow::bail!("{:?}:{} missing pointer count", file, lineno + 1);
        }
        let p_cnt: usize = tokens[idx]
            .parse()
            .with_context(|| format!("{:?}:{} p_cnt", file, lineno + 1))?;
        idx += 1;

        let mut pointers = Vec::with_capacity(p_cnt);
        for _ in 0..p_cnt {
            if tokens.len() < idx + 4 {
                anyhow::bail!("{:?}:{} incomplete pointer block", file, lineno + 1);
            }
            let symbol = tokens[idx];
            let target_offset: u32 = tokens[idx + 1]
                .parse()
                .with_context(|| format!("{:?}:{} pointer target offset", file, lineno + 1))?;
            let target_pos = tokens[idx + 2]
                .chars()
                .next()
                .and_then(Pos::from_char)
                .ok_or_else(|| anyhow::anyhow!("{:?}:{} pointer target pos", file, lineno + 1))?;
            let (src_word, dst_word) = decode_st(tokens[idx + 3]);
            pointers.push(PointerData {
                symbol: text_ref_str(file, bytes, symbol),
                target: SynsetId {
                    pos: target_pos,
                    offset: target_offset,
                },
                src_word,
                dst_word,
            });
            idx += 4;
        }

        let mut frames = Vec::new();
        if matches!(pos, Pos::Verb) {
            let f_cnt: usize = if tokens.len() <= idx {
                0
            } else {
                let v: usize = tokens[idx]
                    .parse()
                    .with_context(|| format!("{:?}:{} f_cnt", file, lineno + 1))?;
                idx += 1;
                v
            };
            for _ in 0..f_cnt {
                if tokens.len() < idx + 3 {
                    anyhow::bail!("{:?}:{} incomplete frame entry", file, lineno + 1);
                }
                if tokens[idx] != "+" {
                    anyhow::bail!("{:?}:{} expected '+' before frame entry", file, lineno + 1);
                }
                let frame_number: u16 = tokens[idx + 1]
                    .parse()
                    .with_context(|| format!("{:?}:{} frame_number", file, lineno + 1))?;
                let word_number = parse_word_number(tokens[idx + 2]);
                frames.push(Frame {
                    frame_number,
                    word_number,
                });
                idx += 3;
            }
        }

        let gloss = parse_gloss(file, bytes, gloss_part)?;
        let id = SynsetId { pos, offset };
        synsets.insert(
            id,
            SynsetData {
                id,
                lex_filenum,
                synset_type,
                words,
                pointers,
                frames,
                gloss,
            },
        );
    }

    Ok(())
}

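/// Splits a gloss of the form `definition; "example"; "example"` into
/// its parts: the definition is everything before the first `;` outside
/// quotes, and each non-empty double-quoted span becomes an example.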
fn parse_gloss(file: FileKind, root: &[u8], gloss: &str) -> Result<GlossData> {
    let trimmed = gloss.trim();
    // An absent gloss arrives as a static "" literal that is not a
    // subslice of `root`, so it must not reach the pointer arithmetic in
    // `text_ref_str`; represent it as an empty in-file range instead.
    if trimmed.is_empty() {
        let empty = TextRef { file, start: 0, len: 0 };
        return Ok(GlossData {
            raw: empty,
            definition: empty,
            examples: Vec::new(),
        });
    }
    let gloss_raw = text_ref_str(file, root, trimmed);

    let mut examples = Vec::new();
    let mut in_quote = false;
    let mut quote_start: Option<usize> = None;
    let mut def_end = trimmed.len();
    for (idx, ch) in trimmed.char_indices() {
        match ch {
            '"' => {
                // A closing quote ends an example; empty quotes are skipped.
                if in_quote {
                    if let Some(start) = quote_start.take()
                        && idx > start + 1
                    {
                        let start_bytes =
                            trimmed.as_ptr() as usize + start + 1 - root.as_ptr() as usize;
                        examples.push(TextRef {
                            file,
                            start: start_bytes,
                            len: idx - start - 1,
                        });
                    }
                } else {
                    quote_start = Some(idx);
                }
                in_quote = !in_quote;
            }
            // The first unquoted ';' terminates the definition.
            ';' if !in_quote && def_end == trimmed.len() => {
                def_end = idx;
            }
            _ => {}
        }
    }

    let definition_slice = trimmed[..def_end].trim();
    let definition = TextRef {
        file,
        start: definition_slice.as_ptr() as usize - root.as_ptr() as usize,
        len: definition_slice.len(),
    };

    Ok(GlossData {
        raw: gloss_raw,
        definition,
        examples,
    })
}

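/// Parses `frames.vrb`: one `frame_number template-text` entry per line.
/// Lines with an unparsable frame number are reported and skipped.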
fn parse_frames_vrb(bytes: &[u8]) -> HashMap<u16, TextRef> {
    let mut frames = HashMap::new();
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() {
            continue;
        }
        let line_str = match std::str::from_utf8(line) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let mut parts = line_str.splitn(2, ' ');
        let num = parts.next().and_then(|t| t.parse::<u16>().ok());
        let text = parts.next().map(str::trim).unwrap_or("");
        if let Some(n) = num {
            // A missing template falls back to a static "" literal that is
            // not a subslice of `bytes`; skip it rather than feed it to the
            // pointer arithmetic below.
            if text.is_empty() {
                continue;
            }
            let start = text.as_ptr() as usize - bytes.as_ptr() as usize;
            frames.insert(
                n,
                TextRef {
                    file: FileKind::Frames,
                    start,
                    len: text.len(),
                },
            );
        } else {
            eprintln!("frames.vrb:{} invalid frame number", lineno + 1);
        }
    }
    frames
}

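/// Parses sense counts, expecting lines of the form
/// `count lemma pos [sense_number]`. Lines with fewer than three tokens
/// or an unparsable count are skipped; an unrecognized pos falls back to
/// `Pos::Noun` and a missing sense number to 1. Note this is a simplified
/// reading: the stock WordNet `cntlist.rev` file keys counts by full
/// sense keys (`lemma%ss_type:lex_filenum:...`), so a preprocessed file
/// in this simpler layout is assumed here.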
fn parse_cntlist(bytes: &[u8]) -> HashMap<(String, Pos, u32), u32> {
    let mut counts = HashMap::new();
    for raw_line in bytes.split(|b| *b == b'\n') {
        let line = strip_cr(raw_line);
        if line.is_empty() {
            continue;
        }
        let line_str = match std::str::from_utf8(line) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let tokens: Vec<&str> = line_str.split_ascii_whitespace().collect();
        if tokens.len() < 3 {
            continue;
        }
        let count: u32 = match tokens[0].parse() {
            Ok(c) => c,
            Err(_) => continue,
        };
        let lemma = normalize_lemma(tokens[1]);
        let pos = tokens[2]
            .chars()
            .next()
            .and_then(Pos::from_char)
            .unwrap_or(Pos::Noun);
        let sense_number: u32 = tokens.get(3).and_then(|t| t.parse().ok()).unwrap_or(1);
        counts.insert((lemma, pos, sense_number), count);
    }
    counts
}

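/// Builds a `TextRef` for `token`, which must be a subslice of `root`:
/// the byte offset is recovered by pointer arithmetic.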
fn text_ref_str(file: FileKind, root: &[u8], token: &str) -> TextRef {
    let start = token.as_ptr() as usize - root.as_ptr() as usize;
    TextRef {
        file,
        start,
        len: token.len(),
    }
}

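/// Drops a trailing `\r` so CRLF-formatted files parse like LF files.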
fn strip_cr(line: &[u8]) -> &[u8] {
    if line.ends_with(b"\r") {
        &line[..line.len() - 1]
    } else {
        line
    }
}

fn parse_word_number(token: &str) -> Option<u16> {
    u16::from_str_radix(token, 16)
        .or_else(|_| token.parse::<u16>())
        .ok()
        .filter(|&v| v != 0)
}

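/// Normalizes a lemma to the index-file convention: trimmed,
/// ASCII-lowercased, with spaces replaced by underscores.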
fn normalize_lemma(text: &str) -> String {
    let mut s = text.trim().to_string();
    s.make_ascii_lowercase();
    s.replace(' ', "_")
}