yoin_core/dic/
mod.rs

1use std::borrow::Borrow;
2use std::iter::Iterator;
3
4mod matrix;
5pub use self::matrix::Matrix;
6
7mod morph;
8pub use self::morph::Morph;
9
10pub mod fst;
11use self::fst::Fst;
12
13pub mod unknown;
14
15pub trait Dic<'a> {
16    type Iterator: Iterator<Item = Morph<&'a str>>;
17    fn lookup_iter(&'a self, input: &'a [u8]) -> Self::Iterator;
18    fn lookup(&'a self, input: &'a [u8]) -> Vec<Morph<&'a str>> {
19        self.lookup_iter(input).collect()
20    }
21    fn lookup_str_iter(&'a self, input: &'a str) -> Self::Iterator {
22        self.lookup_iter(input.as_bytes())
23    }
24    fn lookup_str(&'a self, input: &'a str) -> Vec<Morph<&'a str>> {
25        self.lookup_str_iter(input).collect()
26    }
27}
28
29#[derive(Debug, Clone)]
30pub struct FstDic<T: Borrow<[u8]>> {
31    morph_bytes: T,
32    fst: Fst<T>,
33}
34
35impl<'a> FstDic<&'a [u8]> {
36    pub unsafe fn from_bytes(bytecodes: &'a [u8], morph_bytes: &'a [u8]) -> Self {
37        FstDic {
38            morph_bytes: morph_bytes,
39            fst: Fst::from_bytes(bytecodes),
40        }
41    }
42}
43
44impl FstDic<Vec<u8>> {
45    pub fn build<S: Borrow<str>>(morphs: &[Morph<S>]) -> Self {
46        let mut morph_bytes = Vec::new();
47        let mut fst_inputs = Vec::new();
48        for morph in morphs {
49            let offset = morph_bytes.len();
50            let surface = morph.surface.borrow().as_bytes();
51            fst_inputs.push((surface, offset as u32));
52            morph.encode_native(&mut morph_bytes).unwrap();
53        }
54        fst_inputs.sort();
55        let fst = Fst::build(fst_inputs);
56        FstDic {
57            morph_bytes: morph_bytes,
58            fst: fst,
59        }
60    }
61}
62
63impl<'a, T: Borrow<[u8]>> Dic<'a> for FstDic<T> {
64    type Iterator = Iter<'a>;
65
66    fn lookup_iter(&'a self, input: &'a [u8]) -> Iter<'a> {
67        Iter {
68            morph_bytes: self.morph_bytes.borrow(),
69            iter: self.fst.run_iter(input),
70        }
71    }
72}
73
74pub struct Iter<'a> {
75    morph_bytes: &'a [u8],
76    iter: fst::Iter<'a>,
77}
78
79impl<'a> Iter<'a> {
80    fn fetch_entry(&self, offset: usize) -> Morph<&'a str> {
81        let entry_bytes = &self.morph_bytes[offset..];
82        unsafe { Morph::decode(entry_bytes) }
83    }
84}
85
86impl<'a> Iterator for Iter<'a> {
87    type Item = Morph<&'a str>;
88
89    fn next(&mut self) -> Option<Self::Item> {
90        self.iter.next().map(|acc| self.fetch_entry(acc.0 as usize))
91    }
92}
93
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a small dictionary (including two morphs that share a surface)
    /// and checks that looking up "すもも" returns every registered morph,
    /// each of whose surfaces is a prefix of the input.
    #[test]
    fn test_build_lookup() {
        let entries = vec![
            Morph {
                surface: "す",
                left_id: 1,
                right_id: 1,
                weight: 1,
                contents: "contents 1",
            },
            Morph {
                surface: "す",
                left_id: 2,
                right_id: 2,
                weight: 2,
                contents: "contents 2",
            },
            Morph {
                surface: "すも",
                left_id: 3,
                right_id: 3,
                weight: 3,
                contents: "contents 3",
            },
            Morph {
                surface: "すもも",
                left_id: 4,
                right_id: 4,
                weight: 4,
                contents: "contents 4",
            },
        ];
        let dic = FstDic::build(&entries);
        let found = dic.lookup_str("すもも");
        assert_eq!(found.len(), entries.len());
        // The order of lookup results is not fixed, so only membership is
        // checked for each returned morph.
        for morph in found {
            let known = entries.iter().any(|expected| *expected == morph);
            assert!(known, "invalid result: {:?}", morph);
        }
    }
}