1use std::io;
2use std::path::Path;
3use std::rc::Rc;
4
5use log::debug;
6
7use crate::dictionary::{self, Matrix, Unknown, ViterbiNode, WordDic};
8use crate::morpheme::Morpheme;
9use crate::{Utf16String, Utf16Str};
10use crate::util::DirLike;
11
12type ViterbiNodeList = Vec<Rc<ViterbiNode>>;
13
14fn bos_nodes() -> ViterbiNodeList {
15 vec![Rc::new(ViterbiNode::make_boseos())]
16}
17
18fn empty_vec() -> ViterbiNodeList {
19 Vec::new()
20}
21
22pub struct Tagger {
24 wdc: WordDic,
25 unk: Unknown,
26 mtx: Matrix,
27}
28
29impl Tagger {
30 pub fn new(data_dir: &Path) -> io::Result<Tagger> {
34 let mut dir_like = data_dir;
35 Ok(Tagger {
36 wdc: WordDic::new(&mut dir_like)?,
37 unk: Unknown::new(&mut dir_like)?,
38 mtx: Matrix::new(&mut dir_like)?,
39 })
40 }
41
42 pub fn load_from_dir(dir: &mut dyn DirLike) -> io::Result<Tagger> {
48 Ok(Tagger {
49 wdc: WordDic::new(dir)?,
50 unk: Unknown::new(dir)?,
51 mtx: Matrix::new(dir)?,
52 })
53 }
54
55 pub fn parse<'a, 'b>(&'a self, text: &'b str) -> Vec<Morpheme<'a, 'b>> {
59 let utf16_text: Utf16String = text.encode_utf16().collect::<Vec<_>>();
60 let utf8_offsets = utf8_char_offsets(text, utf16_text.len());
61
62 self.parse_impl(&utf16_text).into_iter().map(|n| {
63 let from = utf8_offsets[n.start];
64 let to = utf8_offsets[n.start + (n.length as usize)];
65
66 Morpheme {
67 surface: &text[from..to],
68 feature: self.wdc.word_data(n.word_id),
69 start: n.start,
70 }
71 }).collect()
72 }
73
74 pub fn wakati(&self, text: &str) -> Vec<String> {
78 let utf16_text: Utf16String = text.encode_utf16().collect::<Vec<_>>();
79 self.parse_impl(&utf16_text).into_iter().map(|n| {
80 String::from_utf16_lossy(&utf16_text[n.start..n.start + (n.length as usize)])
81 }).collect()
82 }
83
84 fn parse_impl(&self, utf16_text: &Utf16Str) -> Vec<Rc<ViterbiNode>> {
85 let len = utf16_text.len();
86 debug!("utf16_text.len: {}", len);
87 let mut nodes_ary: Vec<ViterbiNodeList> = Vec::with_capacity(len + 1);
88 nodes_ary.push(bos_nodes());
89 for _ in 1..=len {
90 nodes_ary.push(empty_vec());
91 }
92
93 let mut f = MakeLattice::new(self, nodes_ary.into_boxed_slice());
94 for i in 0..len {
95 if !f.nodes_ary[i].is_empty() {
96 f.set(i);
97 self.wdc.search(utf16_text, i, &mut f); self.unk.search(utf16_text, i, &self.wdc, &mut f); }
100 }
101 let nodes_ary: Box<[ViterbiNodeList]> = f.into_inner();
102
103 let mut cur: Rc<ViterbiNode> =
104 self.set_mincost_node(ViterbiNode::make_boseos(), &nodes_ary[len]).prev.unwrap();
105
106 let mut result: Vec<Rc<ViterbiNode>> = Vec::with_capacity(len / 2);
108 result.push(cur.clone());
109 while cur.prev.is_some() {
110 cur = cur.prev.as_ref().cloned().unwrap();
111 result.push(cur.clone());
112 }
113 result.pop();
114 result.reverse();
115
116 result
117 }
118
119 fn set_mincost_node(&self, mut vn: ViterbiNode, prevs: &ViterbiNodeList) -> ViterbiNode {
120 let mut min_idx = 0;
121 let p = &prevs[0];
122 let mut min_cost: i32 = p.cost + self.mtx.link_cost(p.right_id, vn.left_id);
123
124 for i in 1..prevs.len() {
125 let p = &prevs[i];
126 let cost = p.cost + self.mtx.link_cost(p.right_id, vn.left_id);
127 if cost < min_cost {
128 min_cost = cost;
129 min_idx = i;
130 }
131 }
132
133 vn.cost += min_cost;
134 vn.prev = Some(prevs[min_idx].clone());
135
136 vn
137 }
138}
139
/// Builds a table that maps each UTF-16 code-unit index of `text` to the
/// byte offset of the character containing that unit.
///
/// A character that needs two UTF-16 units contributes its byte offset
/// twice. The final entry is the total byte length of `text`, so the table
/// has one more entry than `text` has UTF-16 units. `num_chars` is only used
/// to pre-size the table.
fn utf8_char_offsets(text: &str, num_chars: usize) -> Box<[usize]> {
    let mut table: Vec<usize> = Vec::with_capacity(num_chars + 1);
    for (byte_pos, ch) in text.char_indices() {
        // One entry per UTF-16 unit of this character (1 or 2).
        table.extend(std::iter::repeat(byte_pos).take(ch.len_utf16()));
    }
    // Sentinel: the end offset of the last character.
    table.push(text.len());
    table.into_boxed_slice()
}
153
154struct MakeLattice<'a> {
155 tagger: &'a Tagger,
156 nodes_ary: Box<[ViterbiNodeList]>,
157 i: usize,
158 prevs: ViterbiNodeList,
159 empty: bool,
160}
161
162impl<'a> MakeLattice<'a> {
163 fn new(tagger: &Tagger, nodes_ary: Box<[ViterbiNodeList]>) -> MakeLattice {
164 MakeLattice {
165 tagger,
166 nodes_ary,
167 i: 0,
168 prevs: empty_vec(),
169 empty: true,
170 }
171 }
172
173 fn set(&mut self, i: usize) {
174 self.i = i;
175 self.prevs = self.nodes_ary[i].clone();
176 self.nodes_ary[i] = empty_vec();
177 self.empty = true;
178 }
179
180 fn into_inner(self) -> Box<[ViterbiNodeList]> {
181 self.nodes_ary
182 }
183}
184
185impl<'a> dictionary::Callback for MakeLattice<'a> {
186 fn call(&mut self, vn: ViterbiNode) {
187 self.empty = false;
188 let end = self.i + (vn.length as usize);
189
190 if vn.is_space {
191 self.nodes_ary[end].extend(self.prevs.iter().cloned());
192 } else {
193 self.nodes_ary[end].push(
194 Rc::new(self.tagger.set_mincost_node(vn, &self.prevs)));
195 }
196 }
197
198 fn is_empty(&self) -> bool {
199 self.empty
200 }
201}
202
#[cfg(test)]
impl Tagger {
    /// Test-only accessor for the unknown-word handler.
    pub fn unknown(&self) -> &Unknown {
        let Tagger { unk, .. } = self;
        unk
    }
}