1use super::id::WordIdentifier;
2use indexmap::IndexSet;
3use serde::{Deserialize, Serialize};
4use std::str::from_utf8_unchecked;
5
6#[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
7pub struct WordFeaturesMap {
8 #[serde(with = "serde_bytes")]
9 index: Vec<u8>,
10 offsets: Vec<usize>,
11 known: Vec<WordFeatures>, unknown: Vec<WordFeatures>, }
14impl WordFeaturesMap {
15 pub fn new(known: Vec<Vec<String>>, unknown: Vec<Vec<String>>) -> WordFeaturesMap {
16 let mut tmp_index: IndexSet<String> = IndexSet::new();
17 for features in known.iter().chain(unknown.iter()) {
18 for f in features.iter() {
19 tmp_index.insert(f.to_string());
20 }
21 }
22 let mut index = vec![];
23 let mut offsets: Vec<usize> = vec![0; tmp_index.len()];
24 offsets[0] = tmp_index.get_index(0).unwrap().as_bytes().len();
25 for (idx, str) in tmp_index.iter().enumerate() {
26 index.append(&mut str.to_string().into_bytes());
27 if idx > 0 {
28 offsets[idx] = offsets[idx - 1] + str.as_bytes().len();
29 }
30 }
31
32 WordFeaturesMap {
33 known: known
34 .into_iter()
35 .map(|f| {
36 WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect())
37 })
38 .collect(),
39 unknown: unknown
40 .into_iter()
41 .map(|f| {
42 WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect())
43 })
44 .collect(),
45 index,
46 offsets,
47 }
48 }
49
50 pub fn get(&self, wid: &WordIdentifier) -> Option<Vec<&str>> {
51 match wid {
52 WordIdentifier::Known(wid, _) => self.get_known(wid),
53 WordIdentifier::Unknown(wid, _) => self.get_unknown(wid),
54 }
55 }
56
57 pub fn get_known(&self, wid: &usize) -> Option<Vec<&str>> {
58 self.known.get(*wid).map(|f| self.get_string(f))
59 }
60
61 pub fn get_unknown(&self, wid: &usize) -> Option<Vec<&str>> {
62 self.unknown.get(*wid).map(|f| self.get_string(f))
63 }
64
65 fn get_string(&self, f: &WordFeatures) -> Vec<&str> {
66 f.0.iter()
67 .map(|idx| {
68 let idx = *idx;
69 let end = self.offsets[idx];
70 if idx == 0 {
71 unsafe { from_utf8_unchecked(&self.index[0..end]) }
72 } else {
73 unsafe { from_utf8_unchecked(&self.index[(self.offsets[idx - 1])..end]) }
74 }
75 })
76 .collect()
77 }
78}
79
80#[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
83pub struct WordFeatures(Vec<usize>);
84impl WordFeatures {
85 pub fn new(features: Vec<usize>) -> WordFeatures {
86 WordFeatures(features)
87 }
88}