yoin_core/tokenizer/
mod.rs

1use std::iter::Iterator;
2use std::str::Split;
3use std::fmt;
4
5mod lattice;
6use self::lattice::{Lattice, Node, NodeKind};
7use sysdic::SysDic;
8use dic::FstDic;
9
10#[derive(Debug, Clone, PartialEq, Eq)]
11pub struct Token<'a> {
12    start: usize,
13    surface: &'a str,
14    contents: &'a str,
15}
16
17impl<'a> Token<'a> {
18    fn new(node: Node<'a>) -> Self {
19        let Node { start, kind } = node;
20        let (surface, contents) = match kind {
21            NodeKind::BOS | NodeKind::EOS => unreachable!(),
22            NodeKind::Known(morph) => (morph.surface, morph.contents),
23            NodeKind::Unknown(surface, entry) => (surface, entry.contents),
24        };
25
26        Token {
27            start: start,
28            surface: surface,
29            contents: contents,
30        }
31    }
32
33    pub fn surface(&self) -> &str {
34        self.surface
35    }
36
37    pub fn start(&self) -> usize {
38        self.start
39    }
40
41    pub fn end(&self) -> usize {
42        self.start + self.surface().len()
43    }
44
45    pub fn features(&self) -> FeatureIter {
46        FeatureIter(self.contents.split(','))
47    }
48}
49
50impl<'a> fmt::Display for Token<'a> {
51    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
52        write!(f, "{}\t{}", self.surface, self.contents)
53    }
54}
55
/// Iterator over the comma-separated fields of a feature string.
///
/// Thin wrapper around `str::split(',')`: every `','` is a separator, and an
/// empty input yields a single empty field.
#[derive(Debug, Clone)]
pub struct FeatureIter<'a>(Split<'a, char>);

impl<'a> Iterator for FeatureIter<'a> {
    type Item = &'a str;

    /// Returns the next field, or `None` once the string is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    /// Forwards the bounds reported by the wrapped splitter.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
65
66pub struct Tokenizer<'a> {
67    sysdic: SysDic,
68    udic: Option<FstDic<&'a [u8]>>,
69}
70
71impl<'a> Tokenizer<'a> {
72    pub fn new(sysdic: SysDic) -> Self {
73        Tokenizer { sysdic: sysdic, udic: None }
74    }
75
76    pub fn with_udic<'b>(self, udic: FstDic<&'b [u8]>) -> Tokenizer<'b> {
77        Tokenizer { sysdic: self.sysdic, udic: Some(udic) }
78    }
79
80    pub fn tokenize(&'a self, input: &'a str) -> Vec<Token<'a>> {
81        let la = Lattice::build(input, &self.sysdic, self.udic.as_ref());
82        la.into_output().into_iter().map(|node| Token::new(node)).collect()
83    }
84}