1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
use std::borrow::Borrow;
use std::iter::Iterator;

mod matrix;
pub use self::matrix::Matrix;

mod morph;
pub use self::morph::Morph;

pub mod fst;
use self::fst::Fst;

pub mod unknown;

/// Common lookup interface for morph dictionaries.
///
/// The lifetime `'a` ties together the dictionary borrow, the input buffer and
/// the string slices inside the returned [`Morph`] values.
pub trait Dic<'a> {
    /// Iterator type produced by the lookup methods.
    type Iterator: Iterator<Item = Morph<&'a str>>;

    /// Returns an iterator over the morphs found for the raw bytes `input`.
    ///
    /// This is the only required method; every other method is expressed in
    /// terms of it. Which entries count as a match is up to the implementor.
    fn lookup_iter(&'a self, input: &'a [u8]) -> Self::Iterator;

    /// Same as [`lookup_iter`](#tymethod.lookup_iter), but gathers all
    /// results into a `Vec`.
    fn lookup(&'a self, input: &'a [u8]) -> Vec<Morph<&'a str>> {
        let hits = self.lookup_iter(input);
        hits.collect()
    }

    /// Looks up a string by forwarding its UTF-8 bytes to
    /// [`lookup_iter`](#tymethod.lookup_iter).
    fn lookup_str_iter(&'a self, input: &'a str) -> Self::Iterator {
        let bytes = input.as_bytes();
        self.lookup_iter(bytes)
    }

    /// Same as [`lookup_str_iter`](#method.lookup_str_iter), but gathers all
    /// results into a `Vec`.
    fn lookup_str(&'a self, input: &'a str) -> Vec<Morph<&'a str>> {
        let hits = self.lookup_str_iter(input);
        hits.collect()
    }
}

/// Dictionary made of an FST index plus a buffer of encoded morph records.
///
/// `T` is the byte storage used by both parts: borrowed `&[u8]` when created
/// with `from_bytes`, owned `Vec<u8>` when created with `build`.
#[derive(Debug, Clone)]
pub struct FstDic<T>
where
    T: Borrow<[u8]>,
{
    /// Encoded morph records; lookups slice this buffer at the offsets
    /// reported by `fst` and decode a `Morph` from there.
    morph_bytes: T,
    /// Index that is run over the lookup input and yields offsets into
    /// `morph_bytes`.
    fst: Fst<T>,
}

impl<'a> FstDic<&'a [u8]> {
    /// Wraps two borrowed buffers as a dictionary without copying them.
    ///
    /// `bytecodes` is handed to `Fst::from_bytes`; `morph_bytes` is stored
    /// as the morph record buffer.
    ///
    /// # Safety
    ///
    /// Nothing is validated here. Lookups later slice `morph_bytes` at the
    /// offsets the FST reports and pass the result to the unsafe
    /// `Morph::decode`, so the two buffers presumably have to be a matching,
    /// well-formed pair. NOTE(review): confirm the exact requirements against
    /// `Fst::from_bytes` and `Morph::decode`.
    pub unsafe fn from_bytes(bytecodes: &'a [u8], morph_bytes: &'a [u8]) -> Self {
        let index = Fst::from_bytes(bytecodes);
        FstDic {
            morph_bytes: morph_bytes,
            fst: index,
        }
    }
}

impl FstDic<Vec<u8>> {
    /// Builds an owned, in-memory dictionary from `morphs`.
    ///
    /// Each morph is serialized with `Morph::encode_native` into one
    /// contiguous buffer. The byte offset at which its record starts is
    /// registered in the FST under the bytes of the morph's surface, so morphs
    /// sharing a surface each get their own entry.
    ///
    /// # Panics
    ///
    /// Panics if a record would start at an offset that does not fit in
    /// `u32`, or if `encode_native` reports an error.
    pub fn build<S: Borrow<str>>(morphs: &[Morph<S>]) -> Self {
        let mut encoded = Vec::new();
        // One FST entry per morph, so the final length is known up front.
        let mut entries = Vec::with_capacity(morphs.len());
        for morph in morphs {
            let offset = encoded.len();
            // Offsets are stored as `u32`. A bare `as` cast would silently
            // wrap for oversized dictionaries and make lookups decode the
            // wrong record, so fail loudly instead.
            assert!(offset <= u32::max_value() as usize,
                    "morph data is too large: offset {} does not fit in u32",
                    offset);
            let surface = morph.surface.borrow().as_bytes();
            entries.push((surface, offset as u32));
            morph
                .encode_native(&mut encoded)
                .expect("failed to encode a morph into the in-memory buffer");
        }
        // Orders entries by surface bytes, then by offset.
        // NOTE(review): presumably `Fst::build` requires sorted input — confirm.
        entries.sort();
        let index = Fst::build(entries);
        FstDic {
            morph_bytes: encoded,
            fst: index,
        }
    }
}

impl<'a, T> Dic<'a> for FstDic<T>
where
    T: Borrow<[u8]>,
{
    type Iterator = Iter<'a>;

    /// Runs the FST over `input` and returns an iterator that decodes one
    /// morph for every result the FST reports.
    fn lookup_iter(&'a self, input: &'a [u8]) -> Self::Iterator {
        let records: &'a [u8] = self.morph_bytes.borrow();
        let accepted = self.fst.run_iter(input);
        Iter {
            morph_bytes: records,
            iter: accepted,
        }
    }
}

/// Iterator returned by `FstDic`'s lookup methods.
///
/// It walks the results of an FST run and decodes one `Morph` per result.
pub struct Iter<'a> {
    /// Buffer of encoded morph records that the FST results index into.
    morph_bytes: &'a [u8],
    /// Underlying FST run; the first field of each item it yields is used as
    /// a byte offset into `morph_bytes`.
    iter: fst::Iter<'a>,
}

impl<'a> Iter<'a> {
    /// Decodes the morph whose record starts at byte `offset` of the buffer.
    ///
    /// Panics if `offset` is past the end of the buffer.
    fn fetch_entry(&self, offset: usize) -> Morph<&'a str> {
        let record = &self.morph_bytes[offset..];
        // SAFETY: relies on `offset` pointing at the start of a well-formed
        // encoded morph. NOTE(review): this holds for dictionaries made by
        // `FstDic::build`; for `from_bytes` it is the caller's obligation —
        // confirm against `Morph::decode`'s contract.
        unsafe { Morph::decode(record) }
    }
}

impl<'a> Iterator for Iter<'a> {
    type Item = Morph<&'a str>;

    /// Advances the FST run and decodes the morph stored at the offset held
    /// in the first field of the next result; ends when the run ends.
    fn next(&mut self) -> Option<Self::Item> {
        let accepted = self.iter.next()?;
        let offset = accepted.0 as usize;
        Some(self.fetch_entry(offset))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a dictionary whose surfaces are all prefixes of one word and
    /// checks that looking that word up returns every entry exactly as it
    /// was put in.
    #[test]
    fn test_build_lookup() {
        let morphs = vec![
            Morph {
                surface: "す",
                left_id: 1,
                right_id: 1,
                weight: 1,
                contents: "contents 1",
            },
            Morph {
                surface: "す",
                left_id: 2,
                right_id: 2,
                weight: 2,
                contents: "contents 2",
            },
            Morph {
                surface: "すも",
                left_id: 3,
                right_id: 3,
                weight: 3,
                contents: "contents 3",
            },
            Morph {
                surface: "すもも",
                left_id: 4,
                right_id: 4,
                weight: 4,
                contents: "contents 4",
            },
        ];

        let dict = FstDic::build(&morphs);
        let found = dict.lookup_str("すもも");
        assert_eq!(found.len(), morphs.len());

        // the order of lookup results is not fixed.
        for result in found {
            let is_known = morphs.iter().any(|expected| *expected == result);
            assert!(is_known, "invalid result: {:?}", result);
        }
    }
}