1use std::path::Path;
2use std::io::{BufWriter, Write};
3use std::fs::File;
4use byteorder::{WriteBytesExt, NativeEndian as NE};
5use crate::{Utf16Char, Utf16String};
6use crate::util::OutputUtil;
7use crate::dictionary::build::AppResult;
8use crate::trie::{AutoArray, Allocator, KeyStream};
9use crate::trie::node;
10use crate::trie::shrinktail;
11use log::debug;
12
13
14pub fn build(mut key_list: Vec<String>, filepath: &Path) -> AppResult<()> {
19 key_list.sort();
21 key_list.dedup();
22
23 let mut utf16_key_list = Vec::with_capacity(key_list.len());
25 for key in key_list {
26 utf16_key_list.push(key.encode_utf16().collect::<Vec<_>>())
27 }
28
29 let mut bld = Builder::new(&utf16_key_list);
30 let end = bld.ks_list.len();
31 bld.build_impl(&mut Allocator::new(), 0, end, 0);
32 bld.save(filepath)?;
33
34 Ok(())
35}
36
37struct Builder<'a> {
39 ks_list: Vec<KeyStream<'a>>,
40 base: Vec<i32>,
41 chck: Vec<Utf16Char>,
42 begs: Vec<i32>,
43 lens: Vec<Utf16Char>,
44 tail: Utf16String
45}
46
47impl<'a> Builder<'a> {
48 fn new(key_list: &'a [Utf16String]) -> Builder<'a> {
49 let mut ks_list = Vec::with_capacity(key_list.len());
50 for key in key_list {
51 ks_list.push(KeyStream::new(key, 0))
52 }
53
54 Builder {
55 ks_list,
56 base: Vec::new(),
57 chck: Vec::new(),
58 begs: Vec::new(),
59 lens: Vec::new(),
60 tail: Vec::new()
61 }
62 }
63
64 fn save(self, filepath: &Path) -> AppResult<()> {
68 let (tail, begs, lens) = shrinktail::shrink(self.tail, self.begs, self.lens);
69
70 let mut node_size = self.chck.len();
71
72 while node_size > 0 && self.chck[node_size - 1] == node::chck::VACANT_CODE {
74 node_size -= 1;
75 }
76 node_size += node::chck::CODE_LIMIT as usize; debug!("node_size: {}, begs: {}, tail: {}", node_size, begs.len(), tail.len());
78 let mut writer = BufWriter::new(File::create(filepath)?);
79 writer.write_i32::<NE>(node_size as i32)?;
80 writer.write_i32::<NE>(begs.len() as i32)?;
81 writer.write_i32::<NE>(tail.len() as i32)?;
82
83 for n in begs {
85 writer.write_i32::<NE>(n)?;
86 }
87 for i in 0..node_size {
88 writer.write_i32::<NE>(*self.base.get(i).unwrap_or(&node::base::INIT_VALUE))?;
89 }
90
91 for n in lens {
93 writer.write_u16::<NE>(n)?;
94 }
95 for i in 0..node_size {
96 writer.write_u16::<NE>(*self.chck.get(i).unwrap_or(&node::chck::VACANT_CODE))?;
97 }
98
99 writer.put_string(&tail)?;
100 Ok(writer.flush()?)
101 }
102
103 fn build_impl(&mut self, alloca: &mut Allocator, beg: usize, end: usize, root_idx: usize) {
104 if (end - beg) == 1 {
105 self.insert_tail(beg, root_idx);
107 return;
108 }
109
110 let mut end_list: Vec<i32> = Vec::new();
111 let mut code_list: Utf16String = Vec::new();
112 let mut prev: Utf16Char = node::chck::VACANT_CODE;
113
114 for i in beg..end {
116 let cur = self.ks_list[i].read();
117 if prev != cur {
118 prev = cur;
119 code_list.push(cur);
120 end_list.push(i as i32);
121 }
122 }
123 end_list.push(end as i32);
124
125 let x = alloca.x_check(&code_list);
127 for i in 0..code_list.len() {
128 let x_node = self.set_node(code_list[i], root_idx, x);
129 self.build_impl(alloca, end_list[i] as usize, end_list[i + 1] as usize, x_node);
130 }
131 }
132
133 fn set_node(&mut self, code: Utf16Char, prev: usize, x_node: i32) -> usize {
134 let next = x_node as usize + code as usize;
135 self.base.set_auto(prev, x_node, node::base::INIT_VALUE);
136 self.chck.set_auto(next, code, node::chck::VACANT_CODE);
137 next
138 }
139
140 fn insert_tail(&mut self, beg: usize, node: usize) {
141 let rest = self.ks_list[beg].rest();
142
143 self.base.set_auto(node, node::base::ID(self.begs.len() as i32), node::base::INIT_VALUE);
144
145 self.begs.push(self.tail.len() as i32);
146 self.tail.extend_from_slice(rest);
147 self.lens.push(rest.len() as Utf16Char);
148 }
149}