yoin_core/dic/
unknown.rs

1use std::io::{self, Write};
2use std::collections::HashMap;
3
4use byteorder::{ByteOrder, WriteBytesExt, NativeEndian};
5
/// Numeric identifier of a character category.
///
/// Values are used as indices into per-category tables (for example
/// `CharTable::categories`) and are stored as one byte per code point.
pub type CategoryId = u8;
7
/// Settings attached to one character category.
///
/// NOTE(review): the field names match the INVOKE / GROUP / LENGTH columns of
/// a MeCab-style `char.def`; that is presumably what they mean — confirm
/// against the code that consumes them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Category {
    /// Written as a single `0`/`1` byte by `CharTable::encode`.
    pub invoke: bool,
    /// Written as a single `0`/`1` byte by `CharTable::encode`.
    pub group: bool,
    /// Written as a single byte by `CharTable::encode`.
    pub length: u8,
}
14
/// Maps characters to character categories.
pub trait CharCategorize {
    /// Returns the settings of the category that `ch` belongs to.
    fn categorize(&self, ch: char) -> Category;
    /// Returns the id of the category that `ch` belongs to.
    fn category_id(&self, ch: char) -> CategoryId;
}
19
/// In-memory character category table, indexed directly by code point.
pub struct CharTable {
    /// Category id reported for code points that `table` does not cover.
    pub default_id: CategoryId,
    /// Category settings, indexed by `CategoryId`.
    pub categories: Vec<Category>,
    /// Category id per code point. Holds `u16::MAX` (65535) slots, so it
    /// covers U+0000 through U+FFFE.
    pub table: [CategoryId; ::std::u16::MAX as usize],
}
25
26impl CharTable {
27    pub fn new(default_id: CategoryId, categories: Vec<Category>) -> CharTable {
28        CharTable {
29            default_id: default_id,
30            categories: categories,
31            table: [default_id; ::std::u16::MAX as usize],
32        }
33    }
34
35    pub fn set(&mut self, ch: usize, id: CategoryId) {
36        if ch < self.table.len() {
37            self.table[ch] = id;
38        }
39    }
40}
41
42impl CharCategorize for CharTable {
43    fn categorize(&self, ch: char) -> Category {
44        let id = self.category_id(ch);
45        self.categories[id as usize]
46    }
47
48    fn category_id(&self, ch: char) -> CategoryId {
49        let ch = ch as u32;
50        if ch < ::std::u16::MAX as u32 {
51            self.table[ch as usize]
52        } else {
53            self.default_id
54        }
55    }
56}
57
58impl CharTable {
59    pub fn encode<W: Write>(&self, mut w: W) -> io::Result<()> {
60        let n = self.categories.len() as u8;
61        w.write_u8(n)?;
62        w.write_u8(self.default_id)?;
63        for c in self.categories.iter() {
64            w.write_u8(c.invoke as u8)?;
65        }
66        for c in self.categories.iter() {
67            w.write_u8(c.group as u8)?;
68        }
69        for c in self.categories.iter() {
70            w.write_u8(c.length as u8)?;
71        }
72        for &b in self.table.iter() {
73            w.write_u8(b)?;
74        }
75        Ok(())
76    }
77
78    pub fn encode_native<W: Write>(&self, w: W) -> io::Result<()> {
79        self.encode::<W>(w)
80    }
81}
82
/// Borrowed view over a character table in the byte layout written by
/// `CharTable::encode`.
pub struct CompiledCharTable<'a> {
    /// Number of categories; `decode` uses it as the length of `invokes`,
    /// `groups` and `lengths`.
    pub n_categories: u8,
    /// Category id reported for code points at or above `u16::MAX`.
    pub default_id: u8,
    /// One byte per category; non-zero is read as `Category::invoke == true`.
    pub invokes: &'a [u8],
    /// One byte per category; non-zero is read as `Category::group == true`.
    pub groups: &'a [u8],
    /// One byte per category, used as `Category::length`.
    pub lengths: &'a [u8],
    /// Category id per code point.
    pub table: &'a [CategoryId],
}
91
92impl<'a> CharCategorize for CompiledCharTable<'a> {
93    fn categorize(&self, ch: char) -> Category {
94        let id = self.category_id(ch) as usize;
95        Category {
96            invoke: self.invokes[id] != 0,
97            group: self.groups[id] != 0,
98            length: self.lengths[id],
99        }
100    }
101
102    fn category_id(&self, ch: char) -> CategoryId {
103        let ch = ch as u32;
104        if ch < ::std::u16::MAX as u32 {
105            self.table[ch as usize]
106        } else {
107            self.default_id
108        }
109    }
110}
111
112impl<'a> CompiledCharTable<'a> {
113    pub unsafe fn decode(bs: &'a [u8]) -> Self {
114        let ptr = bs.as_ptr() as *const u8;
115        let n = *ptr;
116        let default_id = *ptr.offset(1);
117        let ptr = ptr.offset(2);
118        let invokes = ::std::slice::from_raw_parts(ptr, n as usize);
119        let ptr = ptr.offset(n as isize);
120        let groups = ::std::slice::from_raw_parts(ptr, n as usize);
121        let ptr = ptr.offset(n as isize);
122        let lengths = ::std::slice::from_raw_parts(ptr, n as usize);
123        let ptr = ptr.offset(n as isize);
124        let table = ::std::slice::from_raw_parts(ptr, ::std::u16::MAX as usize);
125        CompiledCharTable {
126            n_categories: n,
127            default_id: default_id,
128            invokes: invokes,
129            groups: groups,
130            lengths: lengths,
131            table: table,
132        }
133    }
134}
135
// Round-trips a `CharTable` through `encode` / `CompiledCharTable::decode`
// and checks that both views categorize the sample characters identically.
#[test]
fn test_encode_decode() {
    let categories = vec![Category {
                              invoke: true,
                              group: false,
                              length: 0,
                          },
                          Category {
                              invoke: false,
                              group: true,
                              length: 1,
                          },
                          Category {
                              invoke: true,
                              group: false,
                              length: 2,
                          }];
    let mut table = CharTable {
        default_id: 0,
        categories: categories,
        table: [0; ::std::u16::MAX as usize],
    };
    table.table['あ' as usize] = 1;
    table.table['a' as usize] = 2;

    let mut buf = Vec::new();
    table.encode(&mut buf).unwrap();
    let compiled = unsafe { CompiledCharTable::decode(&buf) };

    for &ch in &['0', 'あ', 'a'] {
        assert_eq!(compiled.categorize(ch), table.categorize(ch));
    }
}
172
/// A dictionary entry with a borrowed string payload.
#[derive(Debug, Clone, PartialEq)]
pub struct Entry<'a> {
    /// NOTE(review): presumably the left context id used for connection
    /// costs — confirm against the lattice code.
    pub left_id: u16,
    /// NOTE(review): presumably the right context id — confirm.
    pub right_id: u16,
    /// NOTE(review): presumably the entry's cost — confirm.
    pub weight: i16,
    /// Entry payload; written as length-prefixed UTF-8 bytes by `encode`.
    pub contents: &'a str,
}
180
181impl<'a> Entry<'a> {
182    pub fn encode<W: Write, O: ByteOrder>(&self, mut w: W) -> io::Result<()> {
183        w.write_u16::<O>(self.left_id)?;
184        w.write_u16::<O>(self.right_id)?;
185        w.write_i16::<O>(self.weight)?;
186        w.write_u32::<O>(self.contents.len() as u32)?;
187        for &b in self.contents.as_bytes() {
188            w.write_u8(b)?;
189        }
190        Ok(())
191    }
192
193    pub fn encode_native<W: Write>(&self, w: W) -> io::Result<()> {
194        self.encode::<_, NativeEndian>(w)
195    }
196
197    pub unsafe fn decode(bs: &'a [u8]) -> Self {
198        let ptr = bs.as_ptr() as *const u16;
199        let left_id = *ptr;
200        let right_id = *ptr.offset(1);
201        let ptr = ptr.offset(2) as *const i16;
202        let weight = *ptr;
203        let ptr = ptr.offset(1) as *const u32;
204        let len = *ptr;
205        let ptr = ptr.offset(1) as *const u8;
206        let buf = ::std::slice::from_raw_parts(ptr, len as usize);
207        let contents = ::std::str::from_utf8_unchecked(buf);
208        Entry {
209            left_id: left_id,
210            right_id: right_id,
211            weight: weight,
212            contents: contents,
213        }
214    }
215}
216
// An entry encoded in native byte order must decode back to an equal value.
#[test]
fn test_entry_encode() {
    let expected = Entry {
        left_id: 0,
        right_id: 1,
        weight: -1,
        contents: "てすと",
    };
    let mut bytes = Vec::new();
    expected.encode_native(&mut bytes).unwrap();
    let decoded = unsafe { Entry::decode(&bytes) };
    assert_eq!(decoded, expected);
}
230
/// A character categorizer that can also list the dictionary entries
/// registered for a category.
pub trait UnknownDic: CharCategorize {
    /// Returns the entries registered for category `cate`, borrowing their
    /// contents from `self`.
    fn fetch_entries<'a>(&'a self, cate: CategoryId) -> Vec<Entry<'a>>;
}
234
/// Owned dictionary of per-category entries plus the character table used
/// to categorize characters.
pub struct UnkDic {
    /// Category id -> position in `entry_offsets` of the category's first entry.
    pub indices: Vec<u32>, // Category -> Index of initial entry
    /// Category id -> number of entries in the category.
    pub counts: Vec<u32>, // Category -> Counts of entries
    /// Byte offset into `entries` of each encoded entry.
    pub entry_offsets: Vec<u32>,
    /// Concatenated entries, each in the layout written by `Entry::encode_native`.
    pub entries: Vec<u8>,
    /// Character table that `categorize` / `category_id` delegate to.
    pub categories: CharTable,
}
242
243impl CharCategorize for UnkDic {
244    fn categorize(&self, ch: char) -> Category {
245        self.categories.categorize(ch)
246    }
247
248    fn category_id(&self, ch: char) -> CategoryId {
249        self.categories.category_id(ch)
250    }
251}
252
253impl UnknownDic for UnkDic {
254    fn fetch_entries<'a>(&'a self, cate: CategoryId) -> Vec<Entry<'a>> {
255        let count = self.counts[cate as usize] as usize;
256        let index = self.indices[cate as usize] as usize;
257        let offsets = &self.entry_offsets[index..index + count];
258        let mut results = Vec::with_capacity(count);
259        for &offset in offsets {
260            results.push(unsafe { Entry::decode(&self.entries[offset as usize..]) });
261        }
262        results
263    }
264}
265
266impl UnkDic {
267    pub fn build<'a>(entries: HashMap<CategoryId, Vec<Entry<'a>>>, char_table: CharTable) -> Self {
268        let n_cates = entries.len();
269        let mut indices = vec![0; n_cates];
270        let mut counts = vec![0; n_cates];
271        let mut offsets = Vec::new();
272        let mut entry_buf = Vec::new();
273        let mut index = 0;
274        for (id, entries) in entries {
275            indices[id as usize] = index;
276            counts[id as usize] = entries.len() as u32;
277            for entry in entries {
278                let offset = entry_buf.len() as u32;
279                offsets.push(offset);
280                index += 1;
281                entry.encode_native(&mut entry_buf).unwrap();
282            }
283        }
284        UnkDic {
285            indices: indices,
286            counts: counts,
287            entry_offsets: offsets,
288            entries: entry_buf,
289            categories: char_table,
290        }
291    }
292
293    pub fn encode<W: Write, O: ByteOrder>(&self, mut w: W) -> io::Result<()> {
294        w.write_u32::<O>(self.indices.len() as u32)?;
295        for i in &self.indices {
296            w.write_u32::<O>(*i)?;
297        }
298        w.write_u32::<O>(self.counts.len() as u32)?;
299        for i in &self.counts {
300            w.write_u32::<O>(*i)?;
301        }
302        w.write_u32::<O>(self.entry_offsets.len() as u32)?;
303        for i in &self.entry_offsets {
304            w.write_u32::<O>(*i)?;
305        }
306        w.write_u32::<O>(self.entries.len() as u32)?;
307        for b in &self.entries {
308            w.write_u8(*b)?;
309        }
310        self.categories.encode(w)
311    }
312
313    pub fn encode_native<W: Write>(&self, w: W) -> io::Result<()> {
314        self.encode::<_, NativeEndian>(w)
315    }
316}
317
// Every category registered at build time must hand back exactly the
// entries it was given.
#[test]
fn test_unk_dic() {
    let es: Vec<Entry> = ["a", "b"]
        .iter()
        .map(|&s| {
            Entry {
                left_id: 0,
                right_id: 1,
                weight: -1,
                contents: s,
            }
        })
        .collect();

    let mut entries = HashMap::new();
    for id in 0..2 {
        entries.insert(id, es.clone());
    }

    let dic = UnkDic::build(entries, CharTable::new(0, Vec::new()));
    for id in 0..2 {
        assert_eq!(dic.fetch_entries(id), es);
    }
}
339
340pub struct CompiledUnkDic<'a> {
341    indices: &'a [u32],
342    counts: &'a [u32],
343    entry_offsets: &'a [u32],
344    entries: &'a [u8],
345    categories: CompiledCharTable<'a>,
346}
347
348impl<'a> CharCategorize for CompiledUnkDic<'a> {
349    fn categorize(&self, ch: char) -> Category {
350        self.categories.categorize(ch)
351    }
352
353    fn category_id(&self, ch: char) -> CategoryId {
354        self.categories.category_id(ch)
355    }
356}
357
358impl<'a> UnknownDic for CompiledUnkDic<'a> {
359    fn fetch_entries<'b>(&'b self, cate: CategoryId) -> Vec<Entry<'b>> {
360        let count = self.counts[cate as usize] as usize;
361        let index = self.indices[cate as usize] as usize;
362        let offsets = &self.entry_offsets[index..index + count];
363        let mut results = Vec::with_capacity(count);
364        for &offset in offsets {
365            let e = unsafe { Entry::decode(&self.entries[offset as usize..]) };
366            results.push(e);
367        }
368        results
369    }
370}
371
372impl<'a> CompiledUnkDic<'a> {
373    pub unsafe fn decode(bs: &'a [u8]) -> Self {
374        let ptr = bs.as_ptr() as *const u32;
375        let ind_len = *ptr;
376        let ptr = ptr.offset(1) as *const u32;
377        let indices = ::std::slice::from_raw_parts(ptr, ind_len as usize);
378        let ptr = ptr.offset(ind_len as isize);
379        let counts_len = *ptr;
380        let ptr = ptr.offset(1) as *const u32;
381        let counts = ::std::slice::from_raw_parts(ptr, counts_len as usize);
382        let ptr = ptr.offset(counts_len as isize);
383        let entry_offsets_len = *ptr;
384        let ptr = ptr.offset(1) as *const u32;
385        let entry_offsets = ::std::slice::from_raw_parts(ptr, entry_offsets_len as usize);
386        let ptr = ptr.offset(entry_offsets_len as isize);
387        let entries_len = *ptr;
388        let ptr = ptr.offset(1) as *const u8;
389        let entries = ::std::slice::from_raw_parts(ptr, entries_len as usize);
390        let ptr = ptr.offset(entries_len as isize);
391        let ptr_diff = ptr as usize - bs.as_ptr() as usize;
392        let bs = &bs[ptr_diff..];
393        let categories = CompiledCharTable::decode(bs);
394
395        CompiledUnkDic {
396            indices: indices,
397            counts: counts,
398            entry_offsets: entry_offsets,
399            entries: entries,
400            categories: categories,
401        }
402    }
403}
404
// A dictionary serialized with `encode_native` and reopened through
// `CompiledUnkDic::decode` must return the same entries for every category.
#[test]
fn test_unk_dic_encode() {
    let es: Vec<Entry> = ["a", "b"]
        .iter()
        .map(|&s| {
            Entry {
                left_id: 0,
                right_id: 1,
                weight: -1,
                contents: s,
            }
        })
        .collect();

    let mut entries = HashMap::new();
    for id in 0..3 {
        entries.insert(id, es.clone());
    }

    let dic = UnkDic::build(entries, CharTable::new(0, Vec::new()));
    let mut buf = Vec::new();
    dic.encode_native(&mut buf).unwrap();
    let compiled = unsafe { CompiledUnkDic::decode(&buf) };

    for id in 0..3 {
        assert_eq!(dic.fetch_entries(id), compiled.fetch_entries(id));
    }
}
430}