igo/dictionary/
worddic.rs

1use std::io::{self, BufReader};
2use crate::trie::Searcher;
3use crate::dictionary;
4use crate::dictionary::ViterbiNode;
5use crate::util::*;
6use crate::{Utf16Char, Utf16Str};
7use log::debug;
8
9
10pub struct WordDic {
11    trie: Searcher,
12    data: String,
13    indices: Box<[i32]>,
14
15    /// costs[単語ID] = 単語のコスト
16    costs: Box<[i16]>,
17    /// left_ids[単語ID] = 単語の左文脈ID
18    left_ids: Box<[i16]>,
19    /// right_ids[単語ID] = 単語の右文脈ID
20    right_ids: Box<[i16]>,
21    /// data_offsets[単語ID] = 単語の素性データの開始位置
22    data_offsets: Box<[i32]>
23}
24
25impl WordDic {
26    pub fn new(dir: &mut dyn DirLike) -> io::Result<WordDic> {
27        let word2id_path = "word2id";
28        let dat_path ="word.dat";
29        let idx_path = "word.ary.idx";
30        let inf_path = "word.inf";
31
32        let inf_size = dir.file_size(inf_path)?;
33        let mut reader = BufReader::new(dir.open(inf_path)?);
34        let word_count = (inf_size / (4 + 2 + 2 + 2)) as usize;
35        debug!("word_count: {}", word_count);
36
37        let word_data = read_all_as_chars(dir, dat_path)?;
38        let data_offsets = reader.get_int_array(word_count)?;
39        let (word_data, data_offsets) = convert2utf8_data(&word_data, &data_offsets);
40
41        Ok(WordDic {
42            trie: Searcher::new(dir.open(word2id_path)?)?,
43            data: word_data,
44            indices: read_all_as_int_array(dir, idx_path)?,
45
46            data_offsets,
47            left_ids: reader.get_short_array(word_count)?,
48            right_ids: reader.get_short_array(word_count)?,
49            costs: reader.get_short_array(word_count)?
50        })
51    }
52
53
54    pub fn search(&self, text: &Utf16Str, start: usize, callback: &mut dyn dictionary::Callback) {
55        self.trie.each_common_prefix(text, start, |start: usize, offset: i32, trie_id: i32| {
56            /*
57             * common-prefix検索でキーが見つかった場合に呼び出されるクロージャー
58             * each_common_prefix()で該当するキーの部分文字列が見つかった都度に呼び出される
59             *
60             * @param start  入力テキストの検索開始位置
61             * @param offset 一致した部分文字列の終端位置
62             * @param trie_id 一致した部分文字列のID
63             */
64            let trie_id = trie_id as usize;
65            let end: i32 = self.indices[trie_id + 1];
66
67            for i in self.indices[trie_id]..end {
68                let idx = i as usize;
69                callback.call(ViterbiNode {
70                    word_id: i,
71                    start,
72                    length: offset as i16,
73                    cost: i32::from(self.costs[idx]),
74                    left_id: self.left_ids[idx],
75                    right_id: self.right_ids[idx],
76                    is_space: false,
77                    prev: None
78                });
79            }
80        });
81    }
82
83    pub fn search_from_trie_id(&self, trie_id: i32, start: usize, word_length: usize,
84                               is_space: bool, callback: &mut dyn dictionary::Callback) {
85        let trie_id = trie_id as usize;
86        let end = self.indices[trie_id + 1];
87        for i in self.indices[trie_id]..end {
88            let idx = i as usize;
89            callback.call(ViterbiNode {
90                word_id: i,
91                start,
92                length: word_length as i16,
93                cost: i32::from(self.costs[idx]),
94                left_id: self.left_ids[idx],
95                right_id: self.right_ids[idx],
96                is_space,
97                prev: None
98            });
99        }
100    }
101
102    pub fn word_data(&self, word_id: i32) -> &str {
103        let word_id = word_id as usize;
104        &self.data[
105            (self.data_offsets[word_id] as usize) .. (self.data_offsets[word_id + 1] as usize)]
106    }
107}
108
109// word_data()用に、予めString型へ変換しておく
110fn convert2utf8_data(utf16_str: &[Utf16Char], offsets: &[i32]) -> (String, Box<[i32]>) {
111    let mut buf = String::with_capacity(utf16_str.len() * 3);
112    let mut new_offset = vec![0i32; offsets.len()];
113
114    for word_id in 0..(offsets.len() - 1) {
115        let offset = offsets[word_id] as usize;
116        let next_offset = offsets[word_id + 1] as usize;
117        let word_data = String::from_utf16_lossy(&utf16_str[offset..next_offset]);
118        buf.push_str(&word_data);
119        new_offset[word_id + 1] = buf.len() as i32;
120    }
121    debug!("buf size: {} / {}", buf.len(), buf.capacity());
122
123    (buf, new_offset.into_boxed_slice())
124}