1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
use std::iter::Iterator;
use std::str::Split;
use std::fmt;

mod lattice;
use self::lattice::{Lattice, Node, NodeKind};
use sysdic::SysDic;
use dic::FstDic;

/// A single token, as returned by `Tokenizer::tokenize`.
///
/// Every string field is a borrowed slice valid for `'a`; the token itself
/// stores no owned string data.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'a> {
    /// Offset at which the token begins. `end()` adds `surface.len()` (a byte
    /// length) to it, so this is presumably a byte offset into the tokenized
    /// input -- TODO confirm against the lattice builder.
    start: usize,
    /// Surface form of the token.
    surface: &'a str,
    /// Raw feature string; `features()` splits it on `','`.
    contents: &'a str,
}

impl<'a> Token<'a> {
    /// Builds a token from a lattice node.
    ///
    /// Known nodes take both surface and feature string from their morph;
    /// unknown nodes carry their own surface and take the feature string
    /// from the associated entry.
    ///
    /// # Panics
    ///
    /// Panics when `node` is a `BOS` or `EOS` node. NOTE(review): this relies
    /// on callers never passing those sentinels -- confirm that the lattice
    /// output excludes them.
    fn new(node: Node<'a>) -> Self {
        let Node { start, kind } = node;
        let (surface, contents) = match kind {
            NodeKind::BOS | NodeKind::EOS => unreachable!(),
            NodeKind::Known(morph) => (morph.surface, morph.contents),
            NodeKind::Unknown(surface, entry) => (surface, entry.contents),
        };

        Token {
            start: start,
            surface: surface,
            contents: contents,
        }
    }

    /// Returns the surface form of the token.
    ///
    /// The slice is valid for the full `'a` lifetime of the underlying data,
    /// not merely for the duration of this borrow of the token, so it may be
    /// kept after the token is dropped.
    pub fn surface(&self) -> &'a str {
        self.surface
    }

    /// Returns the offset at which the token starts.
    pub fn start(&self) -> usize {
        self.start
    }

    /// Returns the offset one past the end of the token, computed as
    /// `start` plus the byte length of the surface.
    pub fn end(&self) -> usize {
        self.start + self.surface.len()
    }

    /// Returns an iterator over the features of the token, obtained by
    /// splitting the raw feature string on `','`.
    ///
    /// The iterator borrows the underlying `'a` data rather than the token,
    /// so it (and the slices it yields) may outlive the token.
    pub fn features(&self) -> FeatureIter<'a> {
        FeatureIter(self.contents.split(','))
    }
}

impl<'a> fmt::Display for Token<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}\t{}", self.surface, self.contents)
    }
}

/// Iterator over the comma-separated pieces of a feature string.
///
/// This is a thin wrapper around [`Split`] that keeps the concrete splitter
/// type out of the public API. Empty pieces (e.g. from `"a,,b"`) are yielded
/// as empty strings, exactly as `str::split` does.
#[derive(Debug, Clone)]
pub struct FeatureIter<'a>(Split<'a, char>);

impl<'a> Iterator for FeatureIter<'a> {
    type Item = &'a str;

    /// Yields the next piece, or `None` once the string is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }
}

/// Tokenizer holding a system dictionary and, optionally, a user dictionary.
///
/// `'a` is the lifetime of the byte slice backing the user dictionary.
pub struct Tokenizer<'a> {
    /// System dictionary, owned by the tokenizer.
    sysdic: SysDic,
    /// User dictionary over a borrowed byte slice; `None` when the tokenizer
    /// was created with `new` and `with_udic` has not been applied.
    udic: Option<FstDic<&'a [u8]>>,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer that uses only the given system dictionary.
    pub fn new(sysdic: SysDic) -> Self {
        Tokenizer {
            sysdic: sysdic,
            udic: None,
        }
    }

    /// Consumes the tokenizer and returns one that also uses `udic` as its
    /// user dictionary, replacing any user dictionary set previously.
    ///
    /// The returned tokenizer is tied to the lifetime `'b` of the slice that
    /// backs `udic`.
    pub fn with_udic<'b>(self, udic: FstDic<&'b [u8]>) -> Tokenizer<'b> {
        // Keep the system dictionary; the old user dictionary (if any) is dropped.
        let Tokenizer { sysdic, .. } = self;
        Tokenizer {
            sysdic: sysdic,
            udic: Some(udic),
        }
    }

    /// Tokenizes `input` and returns the resulting tokens.
    ///
    /// A lattice is built from `input`, the system dictionary, and the user
    /// dictionary (when present); each node of the lattice output is then
    /// converted into a `Token`. The tokens borrow for `'a`, which this
    /// signature ties to both the tokenizer and `input`.
    pub fn tokenize(&'a self, input: &'a str) -> Vec<Token<'a>> {
        let lattice = Lattice::build(input, &self.sysdic, self.udic.as_ref());
        let nodes = lattice.into_output();
        nodes.into_iter().map(Token::new).collect()
    }
}