// igo/tagger.rs

1use std::io;
2use std::path::Path;
3use std::rc::Rc;
4
5use log::debug;
6
7use crate::dictionary::{self, Matrix, Unknown, ViterbiNode, WordDic};
8use crate::morpheme::Morpheme;
9use crate::{Utf16String, Utf16Str};
10use crate::util::DirLike;
11
/// List of lattice nodes kept for one text position.
///
/// Nodes are reference-counted because a node is stored in the list and can
/// also be referenced from a following node through its `prev` link.
type ViterbiNodeList = Vec<Rc<ViterbiNode>>;
13
14fn bos_nodes() -> ViterbiNodeList {
15    vec![Rc::new(ViterbiNode::make_boseos())]
16}
17
18fn empty_vec() -> ViterbiNodeList {
19    Vec::new()
20}
21
/// Performs morphological analysis.
pub struct Tagger {
    /// Word dictionary: searched for known morphemes and queried for word data.
    wdc: WordDic,
    /// Unknown-word dictionary: searched after the word dictionary at each position.
    unk: Unknown,
    /// Matrix that supplies the link cost between two adjacent nodes.
    mtx: Matrix,
}
28
29impl Tagger {
30    /// バイナリ辞書を読み込んで、形態素解析器のインスタンスを作成する
31    /// # Arguments
32    /// * `data_dir` - バイナリ辞書があるディレクトリ
33    pub fn new(data_dir: &Path) -> io::Result<Tagger> {
34        let mut dir_like = data_dir;
35        Ok(Tagger {
36            wdc: WordDic::new(&mut dir_like)?,
37            unk: Unknown::new(&mut dir_like)?,
38            mtx: Matrix::new(&mut dir_like)?,
39        })
40    }
41
42    /// zip等にアーカイブしたバイナリ辞書を読み込んで、形態素解析器のインスタンスを作成する
43    ///
44    /// WebAssembly等、ファイルシステムに直接アクセスできない環境向け
45    /// # Arguments
46    /// * `dir` - アーカイブファイルのイメージ
47    pub fn load_from_dir(dir: &mut dyn DirLike) -> io::Result<Tagger> {
48        Ok(Tagger {
49            wdc: WordDic::new(dir)?,
50            unk: Unknown::new(dir)?,
51            mtx: Matrix::new(dir)?,
52        })
53    }
54
55    /// 形態素解析を行う
56    /// # Arguments
57    /// * `text` - 解析対象テキスト
58    pub fn parse<'a, 'b>(&'a self, text: &'b str) -> Vec<Morpheme<'a, 'b>> {
59        let utf16_text: Utf16String = text.encode_utf16().collect::<Vec<_>>();
60        let utf8_offsets = utf8_char_offsets(text, utf16_text.len());
61
62        self.parse_impl(&utf16_text).into_iter().map(|n| {
63            let from = utf8_offsets[n.start];
64            let to = utf8_offsets[n.start + (n.length as usize)];
65
66            Morpheme {
67                surface: &text[from..to],
68                feature: self.wdc.word_data(n.word_id),
69                start: n.start,
70            }
71        }).collect()
72    }
73
74    /// 分かち書きを行う
75    /// # Arguments
76    /// * `text` - 分かち書きされるテキスト
77    pub fn wakati(&self, text: &str) -> Vec<String> {
78        let utf16_text: Utf16String = text.encode_utf16().collect::<Vec<_>>();
79        self.parse_impl(&utf16_text).into_iter().map(|n| {
80            String::from_utf16_lossy(&utf16_text[n.start..n.start + (n.length as usize)])
81        }).collect()
82    }
83
84    fn parse_impl(&self, utf16_text: &Utf16Str) -> Vec<Rc<ViterbiNode>> {
85        let len = utf16_text.len();
86        debug!("utf16_text.len: {}", len);
87        let mut nodes_ary: Vec<ViterbiNodeList> = Vec::with_capacity(len + 1);
88        nodes_ary.push(bos_nodes());
89        for _ in 1..=len {
90            nodes_ary.push(empty_vec());
91        }
92
93        let mut f = MakeLattice::new(self, nodes_ary.into_boxed_slice());
94        for i in 0..len {
95            if !f.nodes_ary[i].is_empty() {
96                f.set(i);
97                self.wdc.search(utf16_text, i, &mut f);      // 単語辞書から形態素を検索
98                self.unk.search(utf16_text, i, &self.wdc, &mut f); // 未知語辞書から形態素を検索
99            }
100        }
101        let nodes_ary: Box<[ViterbiNodeList]> = f.into_inner();
102
103        let mut cur: Rc<ViterbiNode> =
104            self.set_mincost_node(ViterbiNode::make_boseos(), &nodes_ary[len]).prev.unwrap();
105
106        // reverse
107        let mut result: Vec<Rc<ViterbiNode>> = Vec::with_capacity(len / 2);
108        result.push(cur.clone());
109        while cur.prev.is_some() {
110            cur = cur.prev.as_ref().cloned().unwrap();
111            result.push(cur.clone());
112        }
113        result.pop();
114        result.reverse();
115
116        result
117    }
118
119    fn set_mincost_node(&self, mut vn: ViterbiNode, prevs: &ViterbiNodeList) -> ViterbiNode {
120        let mut min_idx = 0;
121        let p = &prevs[0];
122        let mut min_cost: i32 = p.cost + self.mtx.link_cost(p.right_id, vn.left_id);
123
124        for i in 1..prevs.len() {
125            let p = &prevs[i];
126            let cost = p.cost + self.mtx.link_cost(p.right_id, vn.left_id);
127            if cost < min_cost {
128                min_cost = cost;
129                min_idx = i;
130            }
131        }
132
133        vn.cost += min_cost;
134        vn.prev = Some(prevs[min_idx].clone());
135
136        vn
137    }
138}
139
/// Builds a table that maps each UTF-16 code-unit index of `text` to the
/// UTF-8 byte offset of the character that code unit belongs to.
///
/// A character that needs two UTF-16 code units gets two entries with the same
/// byte offset. One extra entry holding the total byte length is appended, so
/// the index one past the last code unit is valid as well.
///
/// # Arguments
/// * `text` - text to index
/// * `num_chars` - expected number of entries before the final one; only used
///   to size the initial allocation
fn utf8_char_offsets(text: &str, num_chars: usize) -> Box<[usize]> {
    let mut table: Vec<usize> = Vec::with_capacity(num_chars + 1);
    for (byte_pos, ch) in text.char_indices() {
        // One entry per UTF-16 code unit of this character.
        table.extend(std::iter::repeat(byte_pos).take(ch.len_utf16()));
    }
    table.push(text.len());
    table.into_boxed_slice()
}
153
/// Dictionary-search callback state that fills in the lattice while the text
/// is scanned position by position.
struct MakeLattice<'a> {
    /// Analyzer used to link each new node to its cheapest predecessor.
    tagger: &'a Tagger,
    /// One node list per text position; a node is stored at the position where it ends.
    nodes_ary: Box<[ViterbiNodeList]>,
    /// Position currently being expanded (last value passed to `set`).
    i: usize,
    /// Nodes that ended at position `i`; they are the predecessor candidates
    /// for every node reported at this position.
    prevs: ViterbiNodeList,
    /// `true` until `call` has been invoked since the last `set`.
    empty: bool,
}
161
162impl<'a> MakeLattice<'a> {
163    fn new(tagger: &Tagger, nodes_ary: Box<[ViterbiNodeList]>) -> MakeLattice {
164        MakeLattice {
165            tagger,
166            nodes_ary,
167            i: 0,
168            prevs: empty_vec(),
169            empty: true,
170        }
171    }
172
173    fn set(&mut self, i: usize) {
174        self.i = i;
175        self.prevs = self.nodes_ary[i].clone();
176        self.nodes_ary[i] = empty_vec();
177        self.empty = true;
178    }
179
180    fn into_inner(self) -> Box<[ViterbiNodeList]> {
181        self.nodes_ary
182    }
183}
184
185impl<'a> dictionary::Callback for MakeLattice<'a> {
186    fn call(&mut self, vn: ViterbiNode) {
187        self.empty = false;
188        let end = self.i + (vn.length as usize);
189
190        if vn.is_space {
191            self.nodes_ary[end].extend(self.prevs.iter().cloned());
192        } else {
193            self.nodes_ary[end].push(
194                Rc::new(self.tagger.set_mincost_node(vn, &self.prevs)));
195        }
196    }
197
198    fn is_empty(&self) -> bool {
199        self.empty
200    }
201}
202
#[cfg(test)]
impl Tagger {
    /// Test-only accessor for the unknown-word dictionary held by this tagger.
    pub fn unknown(&self) -> &Unknown {
        let Tagger { unk, .. } = self;
        unk
    }
}