igo/trie/
builder.rs

1use std::path::Path;
2use std::io::{BufWriter, Write};
3use std::fs::File;
4use byteorder::{WriteBytesExt, NativeEndian as NE};
5use crate::{Utf16Char, Utf16String};
6use crate::util::OutputUtil;
7use crate::dictionary::build::AppResult;
8use crate::trie::{AutoArray, Allocator, KeyStream};
9use crate::trie::node;
10use crate::trie::shrinktail;
11use log::debug;
12
13
14/// キー文字列のリストから、DoubleArrayを構築し、ファイルに保存する
15/// # Arguments
16/// * `key_list`  - DoubleArrayのキーとなる文字列のリスト. 破壊的に更新される
17/// * `filepath`  - DoubleArrayを保存するファイルのパス
18pub fn build(mut key_list: Vec<String>, filepath: &Path) -> AppResult<()> {
19    // ソート and ユニーク
20    key_list.sort();
21    key_list.dedup();
22
23    // UTF-16に変換する
24    let mut utf16_key_list = Vec::with_capacity(key_list.len());
25    for key in key_list {
26        utf16_key_list.push(key.encode_utf16().collect::<Vec<_>>())
27    }
28
29    let mut bld = Builder::new(&utf16_key_list);
30    let end = bld.ks_list.len();
31    bld.build_impl(&mut Allocator::new(), 0, end, 0);
32    bld.save(filepath)?;
33
34    Ok(())
35}
36
37/// DoubleArrayの構築を行う
38struct Builder<'a> {
39    ks_list: Vec<KeyStream<'a>>,
40    base: Vec<i32>,
41    chck: Vec<Utf16Char>,
42    begs: Vec<i32>,
43    lens: Vec<Utf16Char>,
44    tail: Utf16String
45}
46
47impl<'a> Builder<'a> {
48    fn new(key_list: &'a [Utf16String]) -> Builder<'a> {
49        let mut ks_list = Vec::with_capacity(key_list.len());
50        for key in key_list {
51            ks_list.push(KeyStream::new(key, 0))
52        }
53
54        Builder {
55            ks_list,
56            base: Vec::new(),
57            chck: Vec::new(),
58            begs: Vec::new(),
59            lens: Vec::new(),
60            tail: Vec::new()
61        }
62    }
63
64    /// 構築したDoubleArrayをファイルに保存する
65    /// # Arguments
66    /// * `filepath`  - DoubleArrayを保存するファイルのパス
67    fn save(self, filepath: &Path) -> AppResult<()> {
68        let (tail, begs, lens) = shrinktail::shrink(self.tail, self.begs, self.lens);
69
70        let mut node_size = self.chck.len();
71
72        // 末尾の未使用部分を取り除く
73        while node_size > 0 && self.chck[node_size - 1] == node::chck::VACANT_CODE {
74            node_size -= 1;
75        }
76        node_size += node::chck::CODE_LIMIT as usize;  // 検索時の範囲外アクセスを防ぐために、余白を設ける
77        debug!("node_size: {}, begs: {}, tail: {}", node_size, begs.len(), tail.len());
78        let mut writer = BufWriter::new(File::create(filepath)?);
79        writer.write_i32::<NE>(node_size as i32)?;
80        writer.write_i32::<NE>(begs.len() as i32)?;
81        writer.write_i32::<NE>(tail.len() as i32)?;
82
83        // 4byte
84        for n in begs {
85            writer.write_i32::<NE>(n)?;
86        }
87        for i in 0..node_size {
88            writer.write_i32::<NE>(*self.base.get(i).unwrap_or(&node::base::INIT_VALUE))?;
89        }
90
91        // 2byte
92        for n in lens {
93            writer.write_u16::<NE>(n)?;
94        }
95        for i in 0..node_size {
96            writer.write_u16::<NE>(*self.chck.get(i).unwrap_or(&node::chck::VACANT_CODE))?;
97        }
98
99        writer.put_string(&tail)?;
100        Ok(writer.flush()?)
101    }
102
103    fn build_impl(&mut self, alloca: &mut Allocator, beg: usize, end: usize, root_idx: usize) {
104        if (end - beg) == 1 {
105            // これ以降は単一の遷移パスしかないので、まとめてTAILに挿入してしまう
106            self.insert_tail(beg, root_idx);
107            return;
108        }
109
110        let mut end_list: Vec<i32> = Vec::new();
111        let mut code_list: Utf16String = Vec::new();
112        let mut prev: Utf16Char = node::chck::VACANT_CODE;
113
114        // root_idxから遷移する文字を集める
115        for i in beg..end {
116            let cur = self.ks_list[i].read();
117            if prev != cur {
118                prev = cur;
119                code_list.push(cur);
120                end_list.push(i as i32);
121            }
122        }
123        end_list.push(end as i32);
124
125        // root_idxから派生(遷移)するノードを設定し、その各ノードに対して再帰的に処理を繰り返す
126        let x = alloca.x_check(&code_list);
127        for i in 0..code_list.len() {
128            let x_node = self.set_node(code_list[i], root_idx, x);
129            self.build_impl(alloca, end_list[i] as usize, end_list[i + 1] as usize, x_node);
130        }
131    }
132
133    fn set_node(&mut self, code: Utf16Char, prev: usize, x_node: i32) -> usize {
134        let next = x_node as usize + code as usize;
135        self.base.set_auto(prev, x_node, node::base::INIT_VALUE);
136        self.chck.set_auto(next, code, node::chck::VACANT_CODE);
137        next
138    }
139
140    fn insert_tail(&mut self, beg: usize, node: usize) {
141        let rest = self.ks_list[beg].rest();
142
143        self.base.set_auto(node, node::base::ID(self.begs.len() as i32), node::base::INIT_VALUE);
144
145        self.begs.push(self.tail.len() as i32);
146        self.tail.extend_from_slice(rest);
147        self.lens.push(rest.len() as Utf16Char);
148    }
149}