1extern crate memmap;
25
26use memmap::{Mmap, MmapOptions};
27use std::fs::File;
28use std::i16;
29use std::i32;
30use std::slice;
31use std::str;
32use std::sync::Arc;
33use std::u16;
34use std::u32;
35
/// Grouping limit consulted by `CharProperty::get_group_length`: once a run of
/// same-category characters exceeds `MAX_GROUPING_SIZE + 1`, that function gives
/// up and returns `-1`.
const MAX_GROUPING_SIZE: u32 = 24;
37
38#[allow(unused_imports)]
39use super::*;
40
41fn unpack_u32(mmap: &Mmap, i: usize) -> u32 {
42 u32::from_le_bytes([mmap[i], mmap[i + 1], mmap[i + 2], mmap[i + 3]])
43}
44
45fn unpack_i32(mmap: &Mmap, i: usize) -> i32 {
46 i32::from_le_bytes([mmap[i], mmap[i + 1], mmap[i + 2], mmap[i + 3]])
47}
48
49fn unpack_u16(mmap: &Mmap, i: usize) -> u16 {
50 u16::from_le_bytes([mmap[i], mmap[i + 1]])
51}
52
53fn unpack_i16(mmap: &Mmap, i: usize) -> i16 {
54 i16::from_le_bytes([mmap[i], mmap[i + 1]])
55}
56
57fn unpack_string(mmap: &Mmap, offset: usize) -> String {
58 let mut end = offset;
59 while mmap[end] != 0 {
60 end += 1;
61 }
62 str::from_utf8(&mmap[offset..end]).unwrap().to_string()
63}
64
/// Decodes the UTF-8 sequence that starts at `s[index]` into a 16-bit code unit.
///
/// Returns `(code, len)`, where `len` is the byte length of the sequence (1 to 4)
/// as announced by the lead byte. A byte that cannot start a sequence (a
/// continuation byte, or `0xF8` and above) yields `(0, 0)`.
///
/// Continuation bytes are masked with `0x3F` but are not validated.
///
/// NOTE(review): code points at or above U+10000 do not fit in a `u16`. For those
/// the high surrogate is shifted left by 8, the low surrogate is added and the sum
/// is truncated, so the result is not a real UCS-2 value. Behaviour kept as is;
/// confirm what callers expect for 4-byte sequences.
///
/// # Panics
///
/// Panics if `index` is out of bounds or if the sequence is cut short by the end
/// of `s`.
fn utf8_to_ucs2(s: &[u8], index: usize) -> (u16, usize) {
    let lead = s[index];

    // Sequence length and payload bits of the lead byte, both taken from its prefix.
    let (ln, lead_bits): (usize, u8) = match lead {
        0x00..=0x7F => (1, lead),
        0xC0..=0xDF => (2, lead & 0x1F),
        0xE0..=0xEF => (3, lead & 0x0F),
        0xF0..=0xF7 => (4, lead & 0x07),
        _ => return (0, 0),
    };

    // Each continuation byte contributes its low six bits.
    let code_point = s[index + 1..index + ln]
        .iter()
        .fold(lead_bits as u32, |acc, &byte| (acc << 6) | (byte & 0x3F) as u32);

    let ch16 = if code_point < 0x10000 {
        code_point as u16
    } else {
        let above_bmp = code_point - 0x10000;
        let high = above_bmp / 0x400 + 0xD800;
        let low = above_bmp % 0x400 + 0xDC00;
        ((high << 8) + low) as u16
    };

    (ch16, ln)
}
109
/// One dictionary hit: a view of the matched input bytes plus the attributes read
/// from the token record and a view of its feature string.
///
/// Both views are raw pointers. The entry owns neither buffer, so whoever holds a
/// `DicEntry` must keep the input slice and the dictionary mapping alive for as
/// long as the pointers are dereferenced.
#[derive(Debug, Clone)]
pub struct DicEntry {
    /// Start of the matched input bytes.
    pub original_ptr: *const u8,
    /// Number of matched input bytes.
    pub original_len: usize,
    /// Presumably the left-context id of the token (verify against the caller).
    pub lc_attr: u16,
    /// Presumably the right-context id of the token (verify against the caller).
    pub rc_attr: u16,
    /// Part-of-speech id stored in the token record.
    pub posid: u16,
    /// Word cost stored in the token record.
    pub wcost: i16,
    /// Start of the feature string inside the dictionary mapping.
    pub feature_ptr: *const u8,
    /// Length of the feature string in bytes (terminator excluded).
    pub feature_len: usize,
    /// Flag supplied by the lookup that produced the entry.
    pub skip: bool,
}

impl DicEntry {
    /// Returns an owned copy of the matched input text.
    ///
    /// # Panics
    ///
    /// Panics if the referenced bytes are not valid UTF-8.
    #[allow(dead_code)]
    fn original_string(&self) -> String {
        // SAFETY: relies on `original_ptr`/`original_len` still describing a live
        // byte buffer. The type does not enforce this; it is the creator's
        // responsibility.
        let bytes = unsafe { slice::from_raw_parts(self.original_ptr, self.original_len) };
        str::from_utf8(bytes).unwrap().to_owned()
    }
}
138
139#[derive(Clone)]
140pub struct CharProperty {
141 pub mmap: Arc<Mmap>,
142 pub category_names: Vec<String>,
143 pub offset: usize,
144}
145
146impl CharProperty {
147 pub fn open(dic_path: &str) -> Result<CharProperty, std::io::Error> {
148 let file = File::open(dic_path)?;
149 let mmap = unsafe { Arc::new(MmapOptions::new().map(&file)?) };
150 let mut category_names: Vec<String> = Vec::new();
151 let num_categories = unpack_u32(&mmap, 0);
152 for i in 0..num_categories {
153 category_names.push(unpack_string(&mmap, (4 + i * 32) as usize));
154 }
155
156 let char_property = CharProperty {
157 mmap: mmap,
158 category_names: category_names,
159 offset: (4 + num_categories * 32) as usize,
160 };
161 Ok(char_property)
162 }
163
164 pub fn get_char_info(&self, code_point: u16) -> (u32, u32, u32, u32, u32) {
165 let v = unpack_u32(&self.mmap, self.offset + (code_point as usize) * 4);
166 (
167 (v >> 18) & 0b11111111, v & 0b111111111111111111, (v >> 26) & 0b1111, (v >> 30) & 0b1, (v >> 31) & 0b1, )
173 }
174
175 pub fn get_group_length(&self, s: &[u8], default_type: u32) -> isize {
176 let mut i: usize = 0;
178 let mut char_count: u32 = 0;
179 while i < s.len() {
180 let (ch16, ln) = utf8_to_ucs2(s, i);
181 let (_, t, _, _, _) = self.get_char_info(ch16);
183
184 if ((1 << default_type) & t) != 0 {
185 i += ln;
186 char_count += 1;
187 if char_count > MAX_GROUPING_SIZE + 1 {
188 return -1;
189 }
190 } else {
191 break;
192 }
193 }
194 i as isize
195 }
196
197 pub fn get_count_length(&self, s: &[u8], default_type: u32, count: u32) -> isize {
198 let mut i: usize = 0;
200 for _ in 0..count {
201 if i >= s.len() {
202 return -1;
203 }
204 let (ch16, ln) = utf8_to_ucs2(s, i);
205 let (_, t, _, _, _) = self.get_char_info(ch16);
207 if ((1 << default_type) & t) == 0 {
208 return -1;
209 }
210
211 i += ln;
212 }
213 i as isize
214 }
215
216 pub fn get_unknown_lengths(&self, s: &[u8]) -> (u32, Vec<usize>, bool) {
217 let mut ln_vec: Vec<usize> = Vec::new();
219 let (ch16, first_ln) = utf8_to_ucs2(s, 0);
220 let (default_type, _, count, group, invoke) = self.get_char_info(ch16);
221 if group != 0 {
222 let ln = self.get_group_length(s, default_type);
223 if ln > 0 {
224 ln_vec.push(ln as usize);
225 }
226 }
227 if count != 0 {
228 for n in 0..count {
229 let ln = self.get_count_length(s, default_type, n + 1);
230 if ln < 0 {
231 break;
232 }
233 ln_vec.push(ln as usize);
234 }
235 }
236
237 if ln_vec.len() == 0 {
238 ln_vec.push(first_ln);
239 }
240
241 (default_type, ln_vec, invoke == 1)
243 }
244}
245
246#[derive(Clone)]
247pub struct MeCabDic {
248 mmap: Arc<Mmap>,
249 da_offset: u32,
250 token_offset: u32,
251 feature_offset: u32,
252}
253
254impl MeCabDic {
255 pub fn open(dic_path: &str) -> Result<MeCabDic, std::io::Error> {
256 let file = File::open(dic_path)?;
257 let mmap = unsafe { Arc::new(MmapOptions::new().map(&file)?) };
258 let _version = unpack_u32(&mmap, 4);
260 let _dictype = unpack_u32(&mmap, 8);
261 let _lexsize = unpack_u32(&mmap, 12);
262 let _lsize = unpack_u32(&mmap, 16);
263 let _rsize = unpack_u32(&mmap, 20);
264 let dsize = unpack_u32(&mmap, 24);
265 let tsize = unpack_u32(&mmap, 28);
266 let _fsize = unpack_u32(&mmap, 32);
267 let _dummy = unpack_u32(&mmap, 36);
268
269 let dic = MeCabDic {
270 mmap: mmap,
271 da_offset: 72,
272 token_offset: 72 + dsize,
273 feature_offset: 72 + dsize + tsize,
274 };
275 Ok(dic)
276 }
277
278 fn base_check(&self, idx: u32) -> (i32, u32) {
279 let i: usize = (self.da_offset + idx * 8) as usize;
280 (unpack_i32(&self.mmap, i), unpack_u32(&self.mmap, i + 4))
281 }
282
283 pub fn exact_match_search(&self, s: &[u8]) -> i32 {
284 let mut v = -1;
285 let mut p: u32;
286
287 let (mut b, _) = self.base_check(0);
288 for (_i, &item) in s.iter().enumerate() {
289 p = (b + (item as i32)) as u32 + 1;
290 let (base, check) = self.base_check(p);
291 if b == (check as i32) {
292 b = base;
293 } else {
294 return v;
295 }
296 }
297
298 p = b as u32;
299 let (n, check) = self.base_check(p);
300 if b == (check as i32) && n < 0 {
301 v = -n - 1;
302 }
303 v
304 }
305
306 pub fn common_prefix_search(&self, s: &[u8]) -> Vec<(i32, usize)> {
307 let mut results: Vec<(i32, usize)> = Vec::new();
308 let mut p: u32;
309
310 let (mut b, _) = self.base_check(0);
311 for (i, &item) in s.iter().enumerate() {
312 p = b as u32;
313 let (n, check) = self.base_check(p);
314 if b == (check as i32) && n < 0 {
315 results.push((-n - 1, i as usize));
316 }
317 p = (b + (item as i32)) as u32 + 1;
318 let (base, check) = self.base_check(p);
319 if b == (check as i32) {
320 b = base;
321 } else {
322 return results;
323 }
324 }
325 p = b as u32;
326
327 let (n, check) = self.base_check(p);
328 if b == (check as i32) && n < 0 {
329 results.push((-n - 1, s.len() as usize));
330 }
331
332 results
333 }
334
335 fn get_entries_by_index(
336 &self,
337 idx: u32,
338 count: u32,
339 s: &[u8],
340 s_len: usize,
341 skip: bool,
342 ) -> Vec<DicEntry> {
343 let mut results: Vec<DicEntry> = Vec::new();
344 for i in 0..count {
345 let offset: usize = (self.token_offset + (idx + i) * 16) as usize;
346 let lc_attr = unpack_u16(&self.mmap, offset);
347 let rc_attr = unpack_u16(&self.mmap, offset + 2);
348 let posid = unpack_u16(&self.mmap, offset + 4);
349 let wcost = unpack_i16(&self.mmap, offset + 6);
350
351 let feature = unpack_u32(&self.mmap, offset + 8);
352 let start = (self.feature_offset + feature) as usize;
353 let mut end = start;
354 while self.mmap[end] != 0 {
355 end += 1;
356 }
357
358 results.push(DicEntry {
359 original_ptr: s.as_ptr(),
360 original_len: s_len,
361 lc_attr: lc_attr,
362 rc_attr: rc_attr,
363 posid: posid,
364 wcost: wcost,
365 feature_ptr: (&self.mmap[start..]).as_ptr(),
366 feature_len: end - start,
367 skip: skip,
368 });
369 }
370
371 results
372 }
373
374 fn get_entries(&self, result: u32, s: &[u8], s_len: usize, skip: bool) -> Vec<DicEntry> {
375 let index = result >> 8;
376 let count = result & 0xFF;
377 self.get_entries_by_index(index, count, s, s_len, skip)
378 }
379
380 pub fn lookup(&self, s: &[u8]) -> Vec<DicEntry> {
381 let mut results: Vec<DicEntry> = Vec::new();
382 for (result, len) in self.common_prefix_search(s).iter() {
383 let index = (*result >> 8) as u32;
384 let count = (result & 0xFF) as u32;
385 let mut new_results = self.get_entries_by_index(index, count, s, *len, false);
386 results.append(&mut new_results);
387 }
388 results
389 }
390
391 pub fn lookup_unknowns(&self, s: &[u8], cp: &CharProperty) -> (Vec<DicEntry>, bool) {
392 let (default_type, ln_vec, invoke) = cp.get_unknown_lengths(s);
393 let category_name = cp.category_names[default_type as usize].as_bytes();
394 let result = self.exact_match_search(category_name);
395 let mut results: Vec<DicEntry> = Vec::new();
396 for i in ln_vec {
397 let mut new_results = self.get_entries(result as u32, s, i, category_name == b"SPACE");
398 results.append(&mut new_results);
399 }
400 (results, invoke)
401 }
402}
403
404#[derive(Clone)]
405pub struct Matrix {
406 mmap: Arc<Mmap>,
407 lsize: usize,
408 }
410
411impl Matrix {
412 pub fn open(dic_path: &str) -> Result<Matrix, std::io::Error> {
413 let file = File::open(dic_path)?;
414 let mmap = unsafe { Arc::new(MmapOptions::new().map(&file)?) };
415 let lsize = unpack_u16(&mmap, 0) as usize;
416 let _rsize = unpack_u16(&mmap, 2) as usize;
417
418 let matrix = Matrix {
419 mmap: mmap,
420 lsize: lsize,
421 };
423 Ok(matrix)
424 }
425
426 pub fn get_trans_cost(&self, id1: u16, id2: u16) -> i32 {
427 let id1 = id1 as usize;
428 let id2 = id2 as usize;
429
430 unpack_i16(&self.mmap, ((id2 * self.lsize + id1) * 2 + 4) as usize) as i32
431 }
432}
433
/// Opening a missing path must fail; opening the configured `sys.dic` must succeed.
#[test]
fn test_dic_open() {
    let missing = MeCabDic::open("/something/wrong/path/sys.dic");
    assert!(missing.is_err(), "Error not occured.");

    let mecabrc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&mecabrc_path).unwrap();
    let sys_dic_path = mecabrc::get_dic_path(&rc_map, "sys.dic");

    let opened = MeCabDic::open(&sys_dic_path);
    assert!(opened.is_ok(), "Can't open dict file.");
    let _sys_dic = opened.unwrap();
}
446
/// Checks the category names and a handful of per-code-point entries of the
/// configured `char.bin`.
#[test]
fn test_char_property() {
    let mecabrc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&mecabrc_path).unwrap();
    let char_bin_path = mecabrc::get_dic_path(&rc_map, "char.bin");
    let cp = CharProperty::open(&char_bin_path).unwrap();

    assert_eq!(
        cp.category_names,
        vec![
            "DEFAULT",
            "SPACE",
            "KANJI",
            "SYMBOL",
            "NUMERIC",
            "ALPHA",
            "HIRAGANA",
            "KATAKANA",
            "KANJINUMERIC",
            "GREEK",
            "CYRILLIC"
        ]
    );

    // (code point, expected `get_char_info` tuple)
    let expectations: [(u16, (u32, u32, u32, u32, u32)); 8] = [
        (0, (0, 1, 0, 1, 0)),
        (0x20, (1, 2, 0, 1, 0)),
        (0x09, (1, 2, 0, 1, 0)),
        (0x6f22, (2, 4, 2, 0, 0)),
        (0x3007, (3, 264, 0, 1, 1)),
        (0x31, (4, 16, 0, 1, 1)),
        (0x3042, (6, 64, 2, 1, 0)),
        (0x4e00, (8, 260, 0, 1, 1)),
    ];
    for &(code_point, info) in expectations.iter() {
        assert_eq!(cp.get_char_info(code_point), info);
    }
}
479
/// Spot-checks two cells of the configured `matrix.bin`.
#[test]
fn test_get_trans_cost() {
    let mecabrc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&mecabrc_path).unwrap();
    let matrix_path = mecabrc::get_dic_path(&rc_map, "matrix.bin");
    let matrix = Matrix::open(&matrix_path).unwrap();

    let first = matrix.get_trans_cost(555, 1283);
    assert_eq!(first, 340);
    let second = matrix.get_trans_cost(10, 1293);
    assert_eq!(second, -1376);
}
487
488#[allow(dead_code)]
489fn assert_entry(e: &DicEntry, lc_attr: u16, rc_attr: u16, posid: u16, wcost: i16) {
490 assert_eq!(e.lc_attr, lc_attr);
491 assert_eq!(e.rc_attr, rc_attr);
492 assert_eq!(e.posid, posid);
493 assert_eq!(e.wcost, wcost);
494}
495
/// Prefix search and full lookup against the configured `sys.dic`.
#[test]
fn test_lookup() {
    let mecabrc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&mecabrc_path).unwrap();
    let sys_dic_path = mecabrc::get_dic_path(&rc_map, "sys.dic");
    let sys_dic = MeCabDic::open(&sys_dic_path).unwrap();

    let sb = "すもももももももものうち".as_bytes();

    let prefixes = sys_dic.common_prefix_search(sb);
    assert_eq!(prefixes.len(), 3);

    let entries = sys_dic.lookup(sb);
    assert_eq!(entries.len(), 9);
}
522
/// Unknown-word lookup against the configured `unk.dic` and `char.bin`.
#[test]
fn test_lookup_unknowns() {
    let mecabrc_path = mecabrc::find_mecabrc().unwrap();
    let rc_map = mecabrc::rc_map(&mecabrc_path).unwrap();
    let unk_dic_path = mecabrc::get_dic_path(&rc_map, "unk.dic");
    let char_bin_path = mecabrc::get_dic_path(&rc_map, "char.bin");
    let unk_dic = MeCabDic::open(&unk_dic_path).unwrap();
    let cp = CharProperty::open(&char_bin_path).unwrap();

    assert_eq!(unk_dic.exact_match_search(b"SPACE"), 9729);

    let input = "1967年".as_bytes();
    let (entries, invoke) = unk_dic.lookup_unknowns(input, &cp);
    assert_eq!(entries.len(), 1);
    assert!(invoke);
    assert_eq!(entries[0].original_string(), "1967".to_string());
}