yoin_core/tokenizer/
mod.rs1use std::iter::Iterator;
2use std::str::Split;
3use std::fmt;
4
5mod lattice;
6use self::lattice::{Lattice, Node, NodeKind};
7use sysdic::SysDic;
8use dic::FstDic;
9
/// A single token produced by the tokenizer.
///
/// All string data is borrowed for `'a`; the struct itself owns nothing.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Token<'a> {
    /// Start position copied from the node this token was built from.
    start: usize,
    /// Surface text of the token.
    surface: &'a str,
    /// Raw feature string; `features()` splits it on `,`.
    contents: &'a str,
}
16
17impl<'a> Token<'a> {
18 fn new(node: Node<'a>) -> Self {
19 let Node { start, kind } = node;
20 let (surface, contents) = match kind {
21 NodeKind::BOS | NodeKind::EOS => unreachable!(),
22 NodeKind::Known(morph) => (morph.surface, morph.contents),
23 NodeKind::Unknown(surface, entry) => (surface, entry.contents),
24 };
25
26 Token {
27 start: start,
28 surface: surface,
29 contents: contents,
30 }
31 }
32
33 pub fn surface(&self) -> &str {
34 self.surface
35 }
36
37 pub fn start(&self) -> usize {
38 self.start
39 }
40
41 pub fn end(&self) -> usize {
42 self.start + self.surface().len()
43 }
44
45 pub fn features(&self) -> FeatureIter {
46 FeatureIter(self.contents.split(','))
47 }
48}
49
50impl<'a> fmt::Display for Token<'a> {
51 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
52 write!(f, "{}\t{}", self.surface, self.contents)
53 }
54}
55
/// Iterator over the `,`-separated fields of a token's contents string.
///
/// Thin wrapper around `str::Split<'a, char>`. `Debug` and `Clone` are derived
/// because the wrapped iterator provides both, so the wrapper can be printed
/// and duplicated like the standard library's own iterators.
#[derive(Debug, Clone)]
pub struct FeatureIter<'a>(Split<'a, char>);
57
58impl<'a> Iterator for FeatureIter<'a> {
59 type Item = &'a str;
60
61 fn next(&mut self) -> Option<Self::Item> {
62 self.0.next()
63 }
64}
65
66pub struct Tokenizer<'a> {
67 sysdic: SysDic,
68 udic: Option<FstDic<&'a [u8]>>,
69}
70
71impl<'a> Tokenizer<'a> {
72 pub fn new(sysdic: SysDic) -> Self {
73 Tokenizer { sysdic: sysdic, udic: None }
74 }
75
76 pub fn with_udic<'b>(self, udic: FstDic<&'b [u8]>) -> Tokenizer<'b> {
77 Tokenizer { sysdic: self.sysdic, udic: Some(udic) }
78 }
79
80 pub fn tokenize(&'a self, input: &'a str) -> Vec<Token<'a>> {
81 let la = Lattice::build(input, &self.sysdic, self.udic.as_ref());
82 la.into_output().into_iter().map(|node| Token::new(node)).collect()
83 }
84}