awabi/
dic.rs

1/*
2*MIT License
3*
4*Copyright (c) 2020 Hajime Nakagami
5*
6*Permission is hereby granted, free of charge, to any person obtaining a copy
7*of this software and associated documentation files (the "Software"), to deal
8*in the Software without restriction, including without limitation the rights
9*to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10*copies of the Software, and to permit persons to whom the Software is
11*furnished to do so, subject to the following conditions:
12*
13*The above copyright notice and this permission notice shall be included in all
14*copies or substantial portions of the Software.
15*
16*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17*IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18*FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19*AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20*LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21*OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22*SOFTWARE.
23*/
24extern crate memmap;
25
26use memmap::{Mmap, MmapOptions};
27use std::fs::File;
28use std::i16;
29use std::i32;
30use std::slice;
31use std::str;
32use std::sync::Arc;
33use std::u16;
34use std::u32;
35
36const MAX_GROUPING_SIZE: u32 = 24;
37
38#[allow(unused_imports)]
39use super::*;
40
41fn unpack_u32(mmap: &Mmap, i: usize) -> u32 {
42    u32::from_le_bytes([mmap[i], mmap[i + 1], mmap[i + 2], mmap[i + 3]])
43}
44
45fn unpack_i32(mmap: &Mmap, i: usize) -> i32 {
46    i32::from_le_bytes([mmap[i], mmap[i + 1], mmap[i + 2], mmap[i + 3]])
47}
48
49fn unpack_u16(mmap: &Mmap, i: usize) -> u16 {
50    u16::from_le_bytes([mmap[i], mmap[i + 1]])
51}
52
53fn unpack_i16(mmap: &Mmap, i: usize) -> i16 {
54    i16::from_le_bytes([mmap[i], mmap[i + 1]])
55}
56
57fn unpack_string(mmap: &Mmap, offset: usize) -> String {
58    let mut end = offset;
59    while mmap[end] != 0 {
60        end += 1;
61    }
62    str::from_utf8(&mmap[offset..end]).unwrap().to_string()
63}
64
/// Decodes the UTF-8 sequence that starts at `s[index]` into a 16-bit code.
///
/// Returns `(code, byte_length)`. The length (1 to 4) is chosen from the lead
/// byte alone; continuation bytes are masked with `0x3F` but not validated.
/// A lead byte that matches none of the four patterns yields `(0, 0)`.
///
/// Code points below `0x10000` are returned unchanged. Larger ones are folded
/// into 16 bits: the two surrogate halves are computed, combined as
/// `(high << 8) + low`, and the sum is truncated to `u16`.
///
/// # Panics
///
/// Panics if `index`, or any continuation byte announced by the lead byte,
/// lies outside `s`.
fn utf8_to_ucs2(s: &[u8], index: usize) -> (u16, usize) {
    let lead = s[index];
    // Six payload bits of the byte `k` positions after the lead byte.
    let tail = |k: usize| (s[index + k] & 0x3F) as u32;

    let (ch32, ln): (u32, usize) = if (lead & 0x80) == 0x00 {
        (lead as u32, 1)
    } else if (lead & 0xE0) == 0xC0 {
        ((((lead & 0x1F) as u32) << 6) | tail(1), 2)
    } else if (lead & 0xF0) == 0xE0 {
        ((((lead & 0x0F) as u32) << 12) | (tail(1) << 6) | tail(2), 3)
    } else if (lead & 0xF8) == 0xF0 {
        (
            (((lead & 0x07) as u32) << 18) | (tail(1) << 12) | (tail(2) << 6) | tail(3),
            4,
        )
    } else {
        (0, 0)
    };

    // Narrow the 32-bit code point to 16 bits.
    let ch16 = if ch32 < 0x10000 {
        ch32 as u16
    } else {
        let supplementary = ch32 - 0x10000;
        let high = supplementary / 0x400 + 0xD800;
        let low = supplementary % 0x400 + 0xDC00;
        ((high << 8) + low) as u16
    };

    (ch16, ln)
}
109
110//fn bytes_to_str(bytes: &[u8]) -> String {
111//    let res = bytes.iter().map(|&s| s as char).collect::<String>();
112//    String::from_utf8(bytes.to_vec()).unwrap()
113//}
114
/// A single dictionary entry attached to a stretch of input text.
///
/// The two byte ranges are stored as raw pointer + length pairs, so the entry
/// carries no lifetime; whoever creates it has to keep the referenced bytes
/// alive for as long as the entry is read.
#[derive(Clone, Debug)]
pub struct DicEntry {
    /// Start of the input bytes this entry was looked up for.
    pub original_ptr: *const u8,
    /// Number of input bytes covered by this entry.
    pub original_len: usize,
    /// Left-context attribute (first `u16` of the token record).
    pub lc_attr: u16,
    /// Right-context attribute (second `u16` of the token record).
    pub rc_attr: u16,
    /// Part-of-speech id (third `u16` of the token record).
    pub posid: u16,
    /// Word cost (`i16` following `posid` in the token record).
    pub wcost: i16,
    /// Start of the feature string bytes.
    pub feature_ptr: *const u8,
    /// Length of the feature string in bytes, terminator excluded.
    pub feature_len: usize,
    /// Flag supplied by the lookup that produced the entry.
    pub skip: bool,
}

impl DicEntry {
    /// Returns an owned copy of the covered input text.
    ///
    /// # Panics
    ///
    /// Panics if the covered bytes are not valid UTF-8.
    #[allow(dead_code)]
    fn original_string(&self) -> String {
        // SAFETY: assumes `original_ptr` still points at `original_len` readable
        // bytes, i.e. the buffer the entry was built from has not been dropped.
        let bytes = unsafe { slice::from_raw_parts(self.original_ptr, self.original_len) };
        str::from_utf8(bytes).unwrap().to_owned()
    }
}
138
139#[derive(Clone)]
140pub struct CharProperty {
141    pub mmap: Arc<Mmap>,
142    pub category_names: Vec<String>,
143    pub offset: usize,
144}
145
146impl CharProperty {
147    pub fn open(dic_path: &str) -> Result<CharProperty, std::io::Error> {
148        let file = File::open(dic_path)?;
149        let mmap = unsafe { Arc::new(MmapOptions::new().map(&file)?) };
150        let mut category_names: Vec<String> = Vec::new();
151        let num_categories = unpack_u32(&mmap, 0);
152        for i in 0..num_categories {
153            category_names.push(unpack_string(&mmap, (4 + i * 32) as usize));
154        }
155
156        let char_property = CharProperty {
157            mmap: mmap,
158            category_names: category_names,
159            offset: (4 + num_categories * 32) as usize,
160        };
161        Ok(char_property)
162    }
163
164    pub fn get_char_info(&self, code_point: u16) -> (u32, u32, u32, u32, u32) {
165        let v = unpack_u32(&self.mmap, self.offset + (code_point as usize) * 4);
166        (
167            (v >> 18) & 0b11111111,   // default_type
168            v & 0b111111111111111111, // type
169            (v >> 26) & 0b1111,       // char count
170            (v >> 30) & 0b1,          // group
171            (v >> 31) & 0b1,          // invoke
172        )
173    }
174
175    pub fn get_group_length(&self, s: &[u8], default_type: u32) -> isize {
176        // aggregate same char types and return length
177        let mut i: usize = 0;
178        let mut char_count: u32 = 0;
179        while i < s.len() {
180            let (ch16, ln) = utf8_to_ucs2(s, i);
181            // default_type, type, count, group, invoke
182            let (_, t, _, _, _) = self.get_char_info(ch16);
183
184            if ((1 << default_type) & t) != 0 {
185                i += ln;
186                char_count += 1;
187                if char_count > MAX_GROUPING_SIZE + 1 {
188                    return -1;
189                }
190            } else {
191                break;
192            }
193        }
194        i as isize
195    }
196
197    pub fn get_count_length(&self, s: &[u8], default_type: u32, count: u32) -> isize {
198        // get char count bytes length
199        let mut i: usize = 0;
200        for _ in 0..count {
201            if i >= s.len() {
202                return -1;
203            }
204            let (ch16, ln) = utf8_to_ucs2(s, i);
205            // default_type, type, count, group, invoke
206            let (_, t, _, _, _) = self.get_char_info(ch16);
207            if ((1 << default_type) & t) == 0 {
208                return -1;
209            }
210
211            i += ln;
212        }
213        i as isize
214    }
215
216    pub fn get_unknown_lengths(&self, s: &[u8]) -> (u32, Vec<usize>, bool) {
217        // get unknown word bytes length vector
218        let mut ln_vec: Vec<usize> = Vec::new();
219        let (ch16, first_ln) = utf8_to_ucs2(s, 0);
220        let (default_type, _, count, group, invoke) = self.get_char_info(ch16);
221        if group != 0 {
222            let ln = self.get_group_length(s, default_type);
223            if ln > 0 {
224                ln_vec.push(ln as usize);
225            }
226        }
227        if count != 0 {
228            for n in 0..count {
229                let ln = self.get_count_length(s, default_type, n + 1);
230                if ln < 0 {
231                    break;
232                }
233                ln_vec.push(ln as usize);
234            }
235        }
236
237        if ln_vec.len() == 0 {
238            ln_vec.push(first_ln);
239        }
240
241        // type, vector of length, invoke always flag
242        (default_type, ln_vec, invoke == 1)
243    }
244}
245
246#[derive(Clone)]
247pub struct MeCabDic {
248    mmap: Arc<Mmap>,
249    da_offset: u32,
250    token_offset: u32,
251    feature_offset: u32,
252}
253
254impl MeCabDic {
255    pub fn open(dic_path: &str) -> Result<MeCabDic, std::io::Error> {
256        let file = File::open(dic_path)?;
257        let mmap = unsafe { Arc::new(MmapOptions::new().map(&file)?) };
258        // dic size unpack_u32(&mmap, 0) ^ 0xef718f77;
259        let _version = unpack_u32(&mmap, 4);
260        let _dictype = unpack_u32(&mmap, 8);
261        let _lexsize = unpack_u32(&mmap, 12);
262        let _lsize = unpack_u32(&mmap, 16);
263        let _rsize = unpack_u32(&mmap, 20);
264        let dsize = unpack_u32(&mmap, 24);
265        let tsize = unpack_u32(&mmap, 28);
266        let _fsize = unpack_u32(&mmap, 32);
267        let _dummy = unpack_u32(&mmap, 36);
268
269        let dic = MeCabDic {
270            mmap: mmap,
271            da_offset: 72,
272            token_offset: 72 + dsize,
273            feature_offset: 72 + dsize + tsize,
274        };
275        Ok(dic)
276    }
277
278    fn base_check(&self, idx: u32) -> (i32, u32) {
279        let i: usize = (self.da_offset + idx * 8) as usize;
280        (unpack_i32(&self.mmap, i), unpack_u32(&self.mmap, i + 4))
281    }
282
283    pub fn exact_match_search(&self, s: &[u8]) -> i32 {
284        let mut v = -1;
285        let mut p: u32;
286
287        let (mut b, _) = self.base_check(0);
288        for (_i, &item) in s.iter().enumerate() {
289            p = (b + (item as i32)) as u32 + 1;
290            let (base, check) = self.base_check(p);
291            if b == (check as i32) {
292                b = base;
293            } else {
294                return v;
295            }
296        }
297
298        p = b as u32;
299        let (n, check) = self.base_check(p);
300        if b == (check as i32) && n < 0 {
301            v = -n - 1;
302        }
303        v
304    }
305
306    pub fn common_prefix_search(&self, s: &[u8]) -> Vec<(i32, usize)> {
307        let mut results: Vec<(i32, usize)> = Vec::new();
308        let mut p: u32;
309
310        let (mut b, _) = self.base_check(0);
311        for (i, &item) in s.iter().enumerate() {
312            p = b as u32;
313            let (n, check) = self.base_check(p);
314            if b == (check as i32) && n < 0 {
315                results.push((-n - 1, i as usize));
316            }
317            p = (b + (item as i32)) as u32 + 1;
318            let (base, check) = self.base_check(p);
319            if b == (check as i32) {
320                b = base;
321            } else {
322                return results;
323            }
324        }
325        p = b as u32;
326
327        let (n, check) = self.base_check(p);
328        if b == (check as i32) && n < 0 {
329            results.push((-n - 1, s.len() as usize));
330        }
331
332        results
333    }
334
335    fn get_entries_by_index(
336        &self,
337        idx: u32,
338        count: u32,
339        s: &[u8],
340        s_len: usize,
341        skip: bool,
342    ) -> Vec<DicEntry> {
343        let mut results: Vec<DicEntry> = Vec::new();
344        for i in 0..count {
345            let offset: usize = (self.token_offset + (idx + i) * 16) as usize;
346            let lc_attr = unpack_u16(&self.mmap, offset);
347            let rc_attr = unpack_u16(&self.mmap, offset + 2);
348            let posid = unpack_u16(&self.mmap, offset + 4);
349            let wcost = unpack_i16(&self.mmap, offset + 6);
350
351            let feature = unpack_u32(&self.mmap, offset + 8);
352            let start = (self.feature_offset + feature) as usize;
353            let mut end = start;
354            while self.mmap[end] != 0 {
355                end += 1;
356            }
357
358            results.push(DicEntry {
359                original_ptr: s.as_ptr(),
360                original_len: s_len,
361                lc_attr: lc_attr,
362                rc_attr: rc_attr,
363                posid: posid,
364                wcost: wcost,
365                feature_ptr: (&self.mmap[start..]).as_ptr(),
366                feature_len: end - start,
367                skip: skip,
368            });
369        }
370
371        results
372    }
373
374    fn get_entries(&self, result: u32, s: &[u8], s_len: usize, skip: bool) -> Vec<DicEntry> {
375        let index = result >> 8;
376        let count = result & 0xFF;
377        self.get_entries_by_index(index, count, s, s_len, skip)
378    }
379
380    pub fn lookup(&self, s: &[u8]) -> Vec<DicEntry> {
381        let mut results: Vec<DicEntry> = Vec::new();
382        for (result, len) in self.common_prefix_search(s).iter() {
383            let index = (*result >> 8) as u32;
384            let count = (result & 0xFF) as u32;
385            let mut new_results = self.get_entries_by_index(index, count, s, *len, false);
386            results.append(&mut new_results);
387        }
388        results
389    }
390
391    pub fn lookup_unknowns(&self, s: &[u8], cp: &CharProperty) -> (Vec<DicEntry>, bool) {
392        let (default_type, ln_vec, invoke) = cp.get_unknown_lengths(s);
393        let category_name = cp.category_names[default_type as usize].as_bytes();
394        let result = self.exact_match_search(category_name);
395        let mut results: Vec<DicEntry> = Vec::new();
396        for i in ln_vec {
397            let mut new_results = self.get_entries(result as u32, s, i, category_name == b"SPACE");
398            results.append(&mut new_results);
399        }
400        (results, invoke)
401    }
402}
403
404#[derive(Clone)]
405pub struct Matrix {
406    mmap: Arc<Mmap>,
407    lsize: usize,
408    //    pub rsize: usize,
409}
410
411impl Matrix {
412    pub fn open(dic_path: &str) -> Result<Matrix, std::io::Error> {
413        let file = File::open(dic_path)?;
414        let mmap = unsafe { Arc::new(MmapOptions::new().map(&file)?) };
415        let lsize = unpack_u16(&mmap, 0) as usize;
416        let _rsize = unpack_u16(&mmap, 2) as usize;
417
418        let matrix = Matrix {
419            mmap: mmap,
420            lsize: lsize,
421            //            rsize: rsize,
422        };
423        Ok(matrix)
424    }
425
426    pub fn get_trans_cost(&self, id1: u16, id2: u16) -> i32 {
427        let id1 = id1 as usize;
428        let id2 = id2 as usize;
429
430        unpack_i16(&self.mmap, ((id2 * self.lsize + id1) * 2 + 4) as usize) as i32
431    }
432}
433
/// Opening a missing path must fail; opening the configured `sys.dic` must succeed.
#[test]
fn test_dic_open() {
    let missing = MeCabDic::open("/something/wrong/path/sys.dic");
    assert!(missing.is_err(), "Error not occured.");

    let rc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&rc_path).unwrap();
    let sys_dic_path = mecabrc::get_dic_path(&rc_map, "sys.dic");

    let opened = MeCabDic::open(&sys_dic_path);
    assert!(opened.is_ok(), "Can't open dict file.");
    let _sys_dic = opened.unwrap();
}
446
/// Checks the category names and a few packed character entries of the
/// configured `char.bin`.
#[test]
fn test_char_property() {
    let rc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&rc_path).unwrap();
    let cp = CharProperty::open(&mecabrc::get_dic_path(&rc_map, "char.bin")).unwrap();

    let expected_names = vec![
        "DEFAULT",
        "SPACE",
        "KANJI",
        "SYMBOL",
        "NUMERIC",
        "ALPHA",
        "HIRAGANA",
        "KATAKANA",
        "KANJINUMERIC",
        "GREEK",
        "CYRILLIC",
    ];
    assert_eq!(cp.category_names, expected_names);

    // code point -> (default_type, type, length, group, invoke)
    let expected_info: [(u16, (u32, u32, u32, u32, u32)); 8] = [
        (0, (0, 1, 0, 1, 0)),        // DEFAULT
        (0x20, (1, 2, 0, 1, 0)),     // SPACE
        (0x09, (1, 2, 0, 1, 0)),     // SPACE
        (0x6f22, (2, 4, 2, 0, 0)),   // KANJI 漢
        (0x3007, (3, 264, 0, 1, 1)), // SYMBOL
        (0x31, (4, 16, 0, 1, 1)),    // NUMERIC 1
        (0x3042, (6, 64, 2, 1, 0)),  // HIRAGANA あ
        (0x4e00, (8, 260, 0, 1, 1)), // KANJINUMERIC 一
    ];
    for &(code_point, info) in expected_info.iter() {
        assert_eq!(cp.get_char_info(code_point), info);
    }
}
479
/// Spot-checks two cells of the configured `matrix.bin`.
#[test]
fn test_get_trans_cost() {
    let rc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&rc_path).unwrap();
    let matrix_path = mecabrc::get_dic_path(&rc_map, "matrix.bin");
    let matrix = Matrix::open(&matrix_path).unwrap();

    let first = matrix.get_trans_cost(555, 1283);
    let second = matrix.get_trans_cost(10, 1293);
    assert_eq!(first, 340);
    assert_eq!(second, -1376);
}
487
488#[allow(dead_code)]
489fn assert_entry(e: &DicEntry, lc_attr: u16, rc_attr: u16, posid: u16, wcost: i16) {
490    assert_eq!(e.lc_attr, lc_attr);
491    assert_eq!(e.rc_attr, rc_attr);
492    assert_eq!(e.posid, posid);
493    assert_eq!(e.wcost, wcost);
494}
495
/// Checks prefix search and entry lookup against the configured `sys.dic`.
///
/// Only the result counts are asserted; the exact values depend on the
/// installed dictionary and are kept below as commented-out references.
#[test]
fn test_lookup() {
    let rc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&rc_path).unwrap();
    let sys_dic = MeCabDic::open(&mecabrc::get_dic_path(&rc_map, "sys.dic")).unwrap();
    let sb = "すもももももももものうち".as_bytes();

    let prefixes = sys_dic.common_prefix_search(sb);
    assert_eq!(prefixes.len(), 3);
    // Ubuntu 18.04's default sys.dic
    //    assert_eq!(r[0], (8849415, 3));
    //    assert_eq!(r[1], (9258497, 6));
    //    assert_eq!(r[2], (9259009, 9));

    let entries = sys_dic.lookup(sb);
    assert_eq!(entries.len(), 9);
    // Ubuntu 18.04's default sys.dic
    //    assert_entry(&entries[0], 560, 560, 30, 10247);
    //    assert_entry(&entries[1], 879, 879, 32, 11484);
    //    assert_entry(&entries[2], 777, 777, 31, 9683);
    //    assert_entry(&entries[3], 602, 602, 31, 9683);
    //    assert_entry(&entries[4], 601, 601, 31, 9683);
    //    assert_entry(&entries[5], 1285, 1285, 38, 10036);
    //    assert_entry(&entries[6], 11, 11, 10, 9609);
    //    assert_entry(&entries[7], 763, 763, 31, 9412);
    //    assert_entry(&entries[8], 1285, 1285, 38, 7546);
}
522
/// Checks unknown-word lookup against the configured `unk.dic` and `char.bin`.
#[test]
fn test_lookup_unknowns() {
    let rc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&rc_path).unwrap();
    let unk_dic = MeCabDic::open(&mecabrc::get_dic_path(&rc_map, "unk.dic")).unwrap();
    let cp = CharProperty::open(&mecabrc::get_dic_path(&rc_map, "char.bin")).unwrap();

    let space_value = unk_dic.exact_match_search(b"SPACE");
    assert_eq!(space_value, 9729);

    let input = "1967年".as_bytes();
    let (entries, invoke) = unk_dic.lookup_unknowns(input, &cp);
    assert_eq!(entries.len(), 1);
    assert!(invoke);
    assert_eq!(entries[0].original_string(), "1967");
}
533    assert_eq!(invoke, true);
534    assert_eq!(entries[0].original_string(), "1967".to_string())
535}