1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
use std::iter::Iterator;
use std::str::Split;
use std::fmt;

mod lattice;
use self::lattice::{Lattice, Node, NodeKind};
use sysdic::SysDic;
use dic::FstDic;

/// A single token produced by `Tokenizer::tokenize`.
///
/// Both string fields are borrowed for `'a`; a `Token` owns no text itself.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'a> {
    /// Start offset copied from the lattice node.
    /// NOTE(review): presumably a byte offset into the input, since `end()`
    /// adds the surface's byte length to it -- confirm against `Lattice`.
    start: usize,
    /// Surface string carried by the node.
    surface: &'a str,
    /// Comma-separated feature string; split lazily by `features()`.
    contents: &'a str,
}

impl<'a> Token<'a> {
    /// Converts a lattice node into a token.
    ///
    /// Known nodes contribute the morph's surface and contents; unknown nodes
    /// contribute the surface stored in the node and the entry's contents.
    ///
    /// # Panics
    ///
    /// Panics if `node` is a `BOS` or `EOS` node. This constructor treats
    /// those as impossible inputs.
    fn new(node: Node<'a>) -> Self {
        let Node { start, kind } = node;
        let (surface, contents) = match kind {
            NodeKind::BOS | NodeKind::EOS => {
                unreachable!("BOS/EOS nodes cannot be converted into a Token")
            }
            NodeKind::Known(morph) => (morph.surface, morph.contents),
            NodeKind::Unknown(surface, entry) => (surface, entry.contents),
        };
        Token { start, surface, contents }
    }

    /// Returns the token's surface string.
    ///
    /// The returned slice borrows for `'a`, not for the lifetime of `&self`,
    /// so it may be kept after this `Token` has been dropped.
    pub fn surface(&self) -> &'a str {
        self.surface
    }

    /// Returns the token's start offset.
    pub fn start(&self) -> usize {
        self.start
    }

    /// Returns the token's end offset: `start()` plus the surface length in bytes.
    pub fn end(&self) -> usize {
        self.start + self.surface.len()
    }

    /// Returns an iterator over the comma-separated features of this token.
    ///
    /// The iterator borrows for `'a`, not for the lifetime of `&self`, so the
    /// yielded strings may outlive this `Token`.
    pub fn features(&self) -> FeatureIter<'a> {
        FeatureIter(self.contents.split(','))
    }
}

/// Formats the token as `surface<TAB>contents`.
impl<'a> fmt::Display for Token<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}\t{}", self.surface, self.contents)
    }
}

/// Iterator over the pieces of a token's contents string, split on `','`.
///
/// Created by `Token::features`.
#[derive(Debug, Clone)]
pub struct FeatureIter<'a>(Split<'a, char>);

impl<'a> Iterator for FeatureIter<'a> {
    type Item = &'a str;

    /// Yields the next comma-separated piece, or `None` when exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }
}

/// Tokenizer backed by a system dictionary and an optional extra dictionary.
pub struct Tokenizer<'a> {
    /// Dictionary always handed to the lattice builder.
    sysdic: SysDic,
    /// Optional second dictionary handed to the lattice builder.
    /// NOTE(review): presumably a user-supplied dictionary -- confirm.
    udic: Option<FstDic<&'a [u8]>>,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer that uses only `sysdic`.
    pub fn new(sysdic: SysDic) -> Self {
        Tokenizer { sysdic, udic: None }
    }

    /// Consumes this tokenizer and returns one that also uses `udic`.
    ///
    /// Any previously configured `udic` is replaced; the system dictionary is
    /// moved over unchanged.
    pub fn with_udic<'b>(self, udic: FstDic<&'b [u8]>) -> Tokenizer<'b> {
        Tokenizer { sysdic: self.sysdic, udic: Some(udic) }
    }

    /// Tokenizes `input`.
    ///
    /// Builds a lattice from `input`, the system dictionary and (if set) the
    /// extra dictionary, then converts every node of the lattice's output
    /// into a `Token`.
    ///
    /// # Panics
    ///
    /// Panics if the lattice output contains a `BOS` or `EOS` node.
    pub fn tokenize(&'a self, input: &'a str) -> Vec<Token<'a>> {
        let la = Lattice::build(input, &self.sysdic, self.udic.as_ref());
        la.into_output().into_iter().map(Token::new).collect()
    }
}