1use std::io::{self, Write};
2use std::collections::HashMap;
3
4use byteorder::{ByteOrder, WriteBytesExt, NativeEndian};
5
/// Identifier of a character category.
///
/// The tables in this file use it as an index into their per-category data.
pub type CategoryId = u8;

/// Settings attached to a single character category.
///
/// NOTE(review): the field names look like the INVOKE / GROUP / LENGTH columns
/// of a MeCab-style `char.def`; this file only stores, copies and compares
/// them, so confirm the exact semantics against the code that consumes them.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Category {
    /// Boolean flag; serialized as a single `0`/`1` byte by the table encoder.
    pub invoke: bool,
    /// Boolean flag; serialized as a single `0`/`1` byte by the table encoder.
    pub group: bool,
    /// Small numeric setting; serialized as a single byte.
    pub length: u8,
}
14
15pub trait CharCategorize {
16 fn categorize(&self, ch: char) -> Category;
17 fn category_id(&self, ch: char) -> CategoryId;
18}
19
20pub struct CharTable {
21 pub default_id: CategoryId,
22 pub categories: Vec<Category>,
23 pub table: [CategoryId; ::std::u16::MAX as usize],
24}
25
26impl CharTable {
27 pub fn new(default_id: CategoryId, categories: Vec<Category>) -> CharTable {
28 CharTable {
29 default_id: default_id,
30 categories: categories,
31 table: [default_id; ::std::u16::MAX as usize],
32 }
33 }
34
35 pub fn set(&mut self, ch: usize, id: CategoryId) {
36 if ch < self.table.len() {
37 self.table[ch] = id;
38 }
39 }
40}
41
42impl CharCategorize for CharTable {
43 fn categorize(&self, ch: char) -> Category {
44 let id = self.category_id(ch);
45 self.categories[id as usize]
46 }
47
48 fn category_id(&self, ch: char) -> CategoryId {
49 let ch = ch as u32;
50 if ch < ::std::u16::MAX as u32 {
51 self.table[ch as usize]
52 } else {
53 self.default_id
54 }
55 }
56}
57
58impl CharTable {
59 pub fn encode<W: Write>(&self, mut w: W) -> io::Result<()> {
60 let n = self.categories.len() as u8;
61 w.write_u8(n)?;
62 w.write_u8(self.default_id)?;
63 for c in self.categories.iter() {
64 w.write_u8(c.invoke as u8)?;
65 }
66 for c in self.categories.iter() {
67 w.write_u8(c.group as u8)?;
68 }
69 for c in self.categories.iter() {
70 w.write_u8(c.length as u8)?;
71 }
72 for &b in self.table.iter() {
73 w.write_u8(b)?;
74 }
75 Ok(())
76 }
77
78 pub fn encode_native<W: Write>(&self, w: W) -> io::Result<()> {
79 self.encode::<W>(w)
80 }
81}
82
83pub struct CompiledCharTable<'a> {
84 pub n_categories: u8,
85 pub default_id: u8,
86 pub invokes: &'a [u8],
87 pub groups: &'a [u8],
88 pub lengths: &'a [u8],
89 pub table: &'a [CategoryId],
90}
91
92impl<'a> CharCategorize for CompiledCharTable<'a> {
93 fn categorize(&self, ch: char) -> Category {
94 let id = self.category_id(ch) as usize;
95 Category {
96 invoke: self.invokes[id] != 0,
97 group: self.groups[id] != 0,
98 length: self.lengths[id],
99 }
100 }
101
102 fn category_id(&self, ch: char) -> CategoryId {
103 let ch = ch as u32;
104 if ch < ::std::u16::MAX as u32 {
105 self.table[ch as usize]
106 } else {
107 self.default_id
108 }
109 }
110}
111
112impl<'a> CompiledCharTable<'a> {
113 pub unsafe fn decode(bs: &'a [u8]) -> Self {
114 let ptr = bs.as_ptr() as *const u8;
115 let n = *ptr;
116 let default_id = *ptr.offset(1);
117 let ptr = ptr.offset(2);
118 let invokes = ::std::slice::from_raw_parts(ptr, n as usize);
119 let ptr = ptr.offset(n as isize);
120 let groups = ::std::slice::from_raw_parts(ptr, n as usize);
121 let ptr = ptr.offset(n as isize);
122 let lengths = ::std::slice::from_raw_parts(ptr, n as usize);
123 let ptr = ptr.offset(n as isize);
124 let table = ::std::slice::from_raw_parts(ptr, ::std::u16::MAX as usize);
125 CompiledCharTable {
126 n_categories: n,
127 default_id: default_id,
128 invokes: invokes,
129 groups: groups,
130 lengths: lengths,
131 table: table,
132 }
133 }
134}
135
/// Round-trips a `CharTable` through `encode` / `CompiledCharTable::decode`
/// and checks that both views categorize a few characters identically.
#[test]
fn test_encode_decode() {
    let categories = vec![
        Category { invoke: true, group: false, length: 0 },
        Category { invoke: false, group: true, length: 1 },
        Category { invoke: true, group: false, length: 2 },
    ];
    let mut table = CharTable::new(0, categories);
    table.set('あ' as usize, 1);
    table.set('a' as usize, 2);

    let mut buf = Vec::new();
    table.encode(&mut buf).unwrap();

    let compiled = unsafe { CompiledCharTable::decode(&buf) };

    // '0' is left on the default category; the other two were set above.
    for &ch in ['0', 'あ', 'a'].iter() {
        assert_eq!(compiled.categorize(ch), table.categorize(ch));
    }
}
172
173#[derive(Debug, Clone, PartialEq)]
174pub struct Entry<'a> {
175 pub left_id: u16,
176 pub right_id: u16,
177 pub weight: i16,
178 pub contents: &'a str,
179}
180
181impl<'a> Entry<'a> {
182 pub fn encode<W: Write, O: ByteOrder>(&self, mut w: W) -> io::Result<()> {
183 w.write_u16::<O>(self.left_id)?;
184 w.write_u16::<O>(self.right_id)?;
185 w.write_i16::<O>(self.weight)?;
186 w.write_u32::<O>(self.contents.len() as u32)?;
187 for &b in self.contents.as_bytes() {
188 w.write_u8(b)?;
189 }
190 Ok(())
191 }
192
193 pub fn encode_native<W: Write>(&self, w: W) -> io::Result<()> {
194 self.encode::<_, NativeEndian>(w)
195 }
196
197 pub unsafe fn decode(bs: &'a [u8]) -> Self {
198 let ptr = bs.as_ptr() as *const u16;
199 let left_id = *ptr;
200 let right_id = *ptr.offset(1);
201 let ptr = ptr.offset(2) as *const i16;
202 let weight = *ptr;
203 let ptr = ptr.offset(1) as *const u32;
204 let len = *ptr;
205 let ptr = ptr.offset(1) as *const u8;
206 let buf = ::std::slice::from_raw_parts(ptr, len as usize);
207 let contents = ::std::str::from_utf8_unchecked(buf);
208 Entry {
209 left_id: left_id,
210 right_id: right_id,
211 weight: weight,
212 contents: contents,
213 }
214 }
215}
216
/// An entry written with `encode_native` must decode back to an equal value.
#[test]
fn test_entry_encode() {
    let expected = Entry {
        left_id: 0,
        right_id: 1,
        weight: -1,
        contents: "てすと",
    };

    let mut bytes = Vec::new();
    expected.encode_native(&mut bytes).unwrap();

    let decoded = unsafe { Entry::decode(&bytes) };
    assert_eq!(decoded, expected);
}
230
231pub trait UnknownDic: CharCategorize {
232 fn fetch_entries<'a>(&'a self, cate: CategoryId) -> Vec<Entry<'a>>;
233}
234
235pub struct UnkDic {
236 pub indices: Vec<u32>, pub counts: Vec<u32>, pub entry_offsets: Vec<u32>,
239 pub entries: Vec<u8>,
240 pub categories: CharTable,
241}
242
243impl CharCategorize for UnkDic {
244 fn categorize(&self, ch: char) -> Category {
245 self.categories.categorize(ch)
246 }
247
248 fn category_id(&self, ch: char) -> CategoryId {
249 self.categories.category_id(ch)
250 }
251}
252
253impl UnknownDic for UnkDic {
254 fn fetch_entries<'a>(&'a self, cate: CategoryId) -> Vec<Entry<'a>> {
255 let count = self.counts[cate as usize] as usize;
256 let index = self.indices[cate as usize] as usize;
257 let offsets = &self.entry_offsets[index..index + count];
258 let mut results = Vec::with_capacity(count);
259 for &offset in offsets {
260 results.push(unsafe { Entry::decode(&self.entries[offset as usize..]) });
261 }
262 results
263 }
264}
265
266impl UnkDic {
267 pub fn build<'a>(entries: HashMap<CategoryId, Vec<Entry<'a>>>, char_table: CharTable) -> Self {
268 let n_cates = entries.len();
269 let mut indices = vec![0; n_cates];
270 let mut counts = vec![0; n_cates];
271 let mut offsets = Vec::new();
272 let mut entry_buf = Vec::new();
273 let mut index = 0;
274 for (id, entries) in entries {
275 indices[id as usize] = index;
276 counts[id as usize] = entries.len() as u32;
277 for entry in entries {
278 let offset = entry_buf.len() as u32;
279 offsets.push(offset);
280 index += 1;
281 entry.encode_native(&mut entry_buf).unwrap();
282 }
283 }
284 UnkDic {
285 indices: indices,
286 counts: counts,
287 entry_offsets: offsets,
288 entries: entry_buf,
289 categories: char_table,
290 }
291 }
292
293 pub fn encode<W: Write, O: ByteOrder>(&self, mut w: W) -> io::Result<()> {
294 w.write_u32::<O>(self.indices.len() as u32)?;
295 for i in &self.indices {
296 w.write_u32::<O>(*i)?;
297 }
298 w.write_u32::<O>(self.counts.len() as u32)?;
299 for i in &self.counts {
300 w.write_u32::<O>(*i)?;
301 }
302 w.write_u32::<O>(self.entry_offsets.len() as u32)?;
303 for i in &self.entry_offsets {
304 w.write_u32::<O>(*i)?;
305 }
306 w.write_u32::<O>(self.entries.len() as u32)?;
307 for b in &self.entries {
308 w.write_u8(*b)?;
309 }
310 self.categories.encode(w)
311 }
312
313 pub fn encode_native<W: Write>(&self, w: W) -> io::Result<()> {
314 self.encode::<_, NativeEndian>(w)
315 }
316}
317
/// Entries handed to `UnkDic::build` must come back unchanged from
/// `fetch_entries` for each category.
#[test]
fn test_unk_dic() {
    let es = ["a", "b"]
        .iter()
        .map(|&s| Entry {
            left_id: 0,
            right_id: 1,
            weight: -1,
            contents: s,
        })
        .collect::<Vec<_>>();

    let mut entries = HashMap::new();
    for id in 0..2 {
        entries.insert(id, es.clone());
    }

    let dic = UnkDic::build(entries, CharTable::new(0, Vec::new()));
    for id in 0..2 {
        assert_eq!(dic.fetch_entries(id), es);
    }
}
339
340pub struct CompiledUnkDic<'a> {
341 indices: &'a [u32],
342 counts: &'a [u32],
343 entry_offsets: &'a [u32],
344 entries: &'a [u8],
345 categories: CompiledCharTable<'a>,
346}
347
348impl<'a> CharCategorize for CompiledUnkDic<'a> {
349 fn categorize(&self, ch: char) -> Category {
350 self.categories.categorize(ch)
351 }
352
353 fn category_id(&self, ch: char) -> CategoryId {
354 self.categories.category_id(ch)
355 }
356}
357
358impl<'a> UnknownDic for CompiledUnkDic<'a> {
359 fn fetch_entries<'b>(&'b self, cate: CategoryId) -> Vec<Entry<'b>> {
360 let count = self.counts[cate as usize] as usize;
361 let index = self.indices[cate as usize] as usize;
362 let offsets = &self.entry_offsets[index..index + count];
363 let mut results = Vec::with_capacity(count);
364 for &offset in offsets {
365 let e = unsafe { Entry::decode(&self.entries[offset as usize..]) };
366 results.push(e);
367 }
368 results
369 }
370}
371
372impl<'a> CompiledUnkDic<'a> {
373 pub unsafe fn decode(bs: &'a [u8]) -> Self {
374 let ptr = bs.as_ptr() as *const u32;
375 let ind_len = *ptr;
376 let ptr = ptr.offset(1) as *const u32;
377 let indices = ::std::slice::from_raw_parts(ptr, ind_len as usize);
378 let ptr = ptr.offset(ind_len as isize);
379 let counts_len = *ptr;
380 let ptr = ptr.offset(1) as *const u32;
381 let counts = ::std::slice::from_raw_parts(ptr, counts_len as usize);
382 let ptr = ptr.offset(counts_len as isize);
383 let entry_offsets_len = *ptr;
384 let ptr = ptr.offset(1) as *const u32;
385 let entry_offsets = ::std::slice::from_raw_parts(ptr, entry_offsets_len as usize);
386 let ptr = ptr.offset(entry_offsets_len as isize);
387 let entries_len = *ptr;
388 let ptr = ptr.offset(1) as *const u8;
389 let entries = ::std::slice::from_raw_parts(ptr, entries_len as usize);
390 let ptr = ptr.offset(entries_len as isize);
391 let ptr_diff = ptr as usize - bs.as_ptr() as usize;
392 let bs = &bs[ptr_diff..];
393 let categories = CompiledCharTable::decode(bs);
394
395 CompiledUnkDic {
396 indices: indices,
397 counts: counts,
398 entry_offsets: entry_offsets,
399 entries: entries,
400 categories: categories,
401 }
402 }
403}
404
/// A dictionary serialized with `encode_native` and reopened through
/// `CompiledUnkDic::decode` must return the same entries as the original.
#[test]
fn test_unk_dic_encode() {
    let es = ["a", "b"]
        .iter()
        .map(|&s| Entry {
            left_id: 0,
            right_id: 1,
            weight: -1,
            contents: s,
        })
        .collect::<Vec<_>>();

    let mut entries = HashMap::new();
    for id in 0..3 {
        entries.insert(id, es.clone());
    }
    let dic = UnkDic::build(entries, CharTable::new(0, Vec::new()));

    let mut buf = Vec::new();
    dic.encode_native(&mut buf).unwrap();
    let compiled = unsafe { CompiledUnkDic::decode(&buf) };

    for id in 0..3 {
        assert_eq!(dic.fetch_entries(id), compiled.fetch_entries(id));
    }
}