goya/
word_features.rs

1use super::id::WordIdentifier;
2use indexmap::IndexSet;
3use serde::{Deserialize, Serialize};
4use std::str::from_utf8_unchecked;
5
6#[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
7pub struct WordFeaturesMap {
8    #[serde(with = "serde_bytes")]
9    index: Vec<u8>,
10    offsets: Vec<usize>,
11    known: Vec<WordFeatures>,   // index = morpheme ID
12    unknown: Vec<WordFeatures>, // index = morpheme ID
13}
14impl WordFeaturesMap {
15    pub fn new(known: Vec<Vec<String>>, unknown: Vec<Vec<String>>) -> WordFeaturesMap {
16        let mut tmp_index: IndexSet<String> = IndexSet::new();
17        for features in known.iter().chain(unknown.iter()) {
18            for f in features.iter() {
19                tmp_index.insert(f.to_string());
20            }
21        }
22        let mut index = vec![];
23        let mut offsets: Vec<usize> = vec![0; tmp_index.len()];
24        offsets[0] = tmp_index.get_index(0).unwrap().as_bytes().len();
25        for (idx, str) in tmp_index.iter().enumerate() {
26            index.append(&mut str.to_string().into_bytes());
27            if idx > 0 {
28                offsets[idx] = offsets[idx - 1] + str.as_bytes().len();
29            }
30        }
31
32        WordFeaturesMap {
33            known: known
34                .into_iter()
35                .map(|f| {
36                    WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect())
37                })
38                .collect(),
39            unknown: unknown
40                .into_iter()
41                .map(|f| {
42                    WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect())
43                })
44                .collect(),
45            index,
46            offsets,
47        }
48    }
49
50    pub fn get(&self, wid: &WordIdentifier) -> Option<Vec<&str>> {
51        match wid {
52            WordIdentifier::Known(wid, _) => self.get_known(wid),
53            WordIdentifier::Unknown(wid, _) => self.get_unknown(wid),
54        }
55    }
56
57    pub fn get_known(&self, wid: &usize) -> Option<Vec<&str>> {
58        self.known.get(*wid).map(|f| self.get_string(f))
59    }
60
61    pub fn get_unknown(&self, wid: &usize) -> Option<Vec<&str>> {
62        self.unknown.get(*wid).map(|f| self.get_string(f))
63    }
64
65    fn get_string(&self, f: &WordFeatures) -> Vec<&str> {
66        f.0.iter()
67            .map(|idx| {
68                let idx = *idx;
69                let end = self.offsets[idx];
70                if idx == 0 {
71                    unsafe { from_utf8_unchecked(&self.index[0..end]) }
72                } else {
73                    unsafe { from_utf8_unchecked(&self.index[(self.offsets[idx - 1])..end]) }
74                }
75            })
76            .collect()
77    }
78}
79
80/// > 5カラム目以降は, ユーザ定義の CSV フィールドです. 基本的に どんな内容でも CSV の許す限り追加することができます.
81/// > https://taku910.github.io/mecab/dic-detail.html
82#[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
83pub struct WordFeatures(Vec<usize>);
84impl WordFeatures {
85    pub fn new(features: Vec<usize>) -> WordFeatures {
86        WordFeatures(features)
87    }
88}