1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
use std::ops::Deref;

use tantivy_fst;
use tantivy_fst::raw::Output;

use crate::core::word_entry::WordEntry;

/// Serialized FST, embedded at compile time from `lindera-ipadic/dict.fst`.
const IPADIC_DATA: &[u8] = include_bytes!("../../lindera-ipadic/dict.fst");
/// Serialized word entries, embedded at compile time from `lindera-ipadic/dict.vals`.
const IPADIC_VALS: &[u8] = include_bytes!("../../lindera-ipadic/dict.vals");

/// Number of low bits of an FST output value that hold the entry count.
const ENTRY_COUNT_BITS: u64 = 5;
/// Mask selecting the entry count from an FST output value.
const ENTRY_COUNT_MASK: u64 = (1u64 << ENTRY_COUNT_BITS) - 1u64;

/// Dictionary that maps byte-string keys (through an FST) to runs of
/// [`WordEntry`] records stored in a flat byte buffer.
///
/// Each FST output value packs two numbers: the low [`ENTRY_COUNT_BITS`] bits
/// are the number of entries for the key, and the remaining high bits are the
/// index of the first entry, counted in units of `WordEntry::SERIALIZED_LEN`
/// bytes into `vals_data`.
pub struct PrefixDict<Data = &'static [u8]> {
    /// FST mapping keys to packed `(entry index, entry count)` values.
    pub fst: tantivy_fst::raw::Fst<Data>,
    /// Back-to-back serialized `WordEntry` records.
    pub vals_data: Data,
}

impl Default for PrefixDict<&'static [u8]> {
    /// Builds the dictionary from the data embedded in the binary.
    ///
    /// # Panics
    ///
    /// Panics if the embedded FST bytes are rejected by `tantivy_fst`.
    fn default() -> Self {
        PrefixDict::from_static_slice(IPADIC_DATA, IPADIC_VALS)
            .expect("embedded IPADIC dictionary must be a valid FST")
    }
}

impl PrefixDict<&'static [u8]> {
    /// Builds a dictionary from static FST bytes and static entry bytes.
    ///
    /// # Errors
    ///
    /// Returns the error from `tantivy_fst::raw::Fst::new` when `fst_data`
    /// cannot be loaded as an FST. `vals_data` is stored without validation.
    pub fn from_static_slice(
        fst_data: &'static [u8],
        vals_data: &'static [u8],
    ) -> tantivy_fst::Result<PrefixDict> {
        let fst = tantivy_fst::raw::Fst::new(fst_data)?;
        Ok(PrefixDict { fst, vals_data })
    }
}

impl<D: Deref<Target = [u8]>> PrefixDict<D> {
    /// Iterates over every dictionary entry whose key is a prefix of `s`.
    ///
    /// Each item is `(prefix_len, entry)`, where `prefix_len` is the length of
    /// the matching prefix in bytes (not chars). Prefixes are visited from
    /// shortest to longest, and the walk stops at the first byte of `s` that
    /// has no transition in the FST. The empty prefix is never reported.
    ///
    /// # Panics
    ///
    /// Panics while iterating if an FST output points past the end of
    /// `vals_data` (slice indexing). `WordEntry::deserialize` is defined
    /// elsewhere and may have panic conditions of its own.
    pub fn prefix<'a>(&'a self, s: &'a str) -> impl Iterator<Item = (usize, WordEntry)> + 'a {
        s.as_bytes()
            .iter()
            // Walk the FST one input byte at a time, carrying the number of
            // bytes consumed, the current node and the accumulated output.
            .scan(
                (0, self.fst.root(), Output::zero()),
                move |(prefix_len, node, output), &byte| {
                    // No transition for this byte: returning `None` ends the scan.
                    let b_index = node.find_input(byte)?;
                    let transition = node.transition(b_index);
                    *prefix_len += 1;
                    *output = output.cat(transition.out);
                    *node = self.fst.node(transition.addr);
                    Some((node.is_final(), *prefix_len, output.value()))
                },
            )
            // Only final nodes correspond to complete dictionary keys.
            .filter(|&(is_final, _, _)| is_final)
            .flat_map(move |(_, prefix_len, packed)| {
                let entry_count = (packed & ENTRY_COUNT_MASK) as usize;
                let first_entry = (packed >> ENTRY_COUNT_BITS) as usize;
                let offset_bytes = first_entry * WordEntry::SERIALIZED_LEN;
                let data: &[u8] = &self.vals_data[offset_bytes..];
                (0..entry_count).map(move |i| {
                    (
                        prefix_len,
                        WordEntry::deserialize(&data[WordEntry::SERIALIZED_LEN * i..]),
                    )
                })
            })
    }
}

#[cfg(test)]
mod tests {
    use super::PrefixDict;

    #[test]
    fn test_fst_prefix_2() {
        let prefix_dict = PrefixDict::default();
        assert_eq!(prefix_dict.prefix("—でも").count(), 1);
    }

    #[test]
    fn test_fst_prefix_tilde() {
        let prefix_dict = PrefixDict::default();
        assert_eq!(prefix_dict.prefix("〜").count(), 2);
    }

    #[test]
    fn test_fst_ikkagetsu() {
        let prefix_dict = PrefixDict::default();
        assert_eq!(prefix_dict.prefix("ー").count(), 0);
        assert_eq!(prefix_dict.prefix("ヶ月").count(), 1);
    }

    #[test]
    fn test_fst_prefix_asterisk_symbol() {
        let prefix_dict = PrefixDict::default();
        assert_eq!(prefix_dict.prefix("※").count(), 1);
    }
}