jpreprocess_dictionary_builder/to_csv/
mod.rs1use crate::build_dict::WordEntryMap;
2use byteorder::{ByteOrder, LittleEndian};
3use jpreprocess_dictionary::DictionarySerializer;
4use lindera_core::{prefix_dict::PrefixDict, word_entry::WordEntry, LinderaResult};
5use std::collections::BTreeMap;
6
7use self::da::DoubleArrayParser;
8
9mod da;
10
11pub fn dict_to_csv<S: DictionarySerializer>(
16 prefix_dict: &PrefixDict,
17 words_idx_data: &[u8],
18 words_data: &[u8],
19 serializer: &S,
20) -> LinderaResult<Vec<String>> {
21 let word_entry_map = inverse_prefix_dict(prefix_dict, true);
22
23 let rows: Vec<(String, WordEntry)> = word_entry_map
24 .into_iter()
25 .flat_map(|(string, word_entries)| {
26 word_entries
27 .into_iter()
28 .map(move |word_entry| (string.to_owned(), word_entry))
29 })
30 .collect();
31
32 let words: Vec<String> = rows.iter().map(|w| w.0.to_owned()).collect();
33
34 Ok(rows
35 .into_iter()
36 .zip(words_to_csv(words_idx_data, words_data, words, serializer)?)
37 .map(|((string, word_entry), right)| {
38 format!(
39 "{},{},{},{},{}",
40 string, word_entry.left_id, word_entry.right_id, word_entry.word_cost, right
41 )
42 })
43 .collect())
44}
45
46pub fn inverse_prefix_dict(prefix_dict: &PrefixDict, is_system: bool) -> WordEntryMap {
51 let mut result: WordEntryMap = BTreeMap::new();
52
53 let keyset = DoubleArrayParser(&prefix_dict.da.0).inverse_da();
54 for (s, offset_len) in keyset {
55 let len = offset_len & 0x1f;
56 let offset = offset_len >> 5;
57 let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
58 let data: &[u8] = &prefix_dict.vals_data[offset_bytes..];
59 result.insert(
60 s,
61 (0..len as usize)
62 .map(move |i| {
63 WordEntry::deserialize(&data[WordEntry::SERIALIZED_LEN * i..], is_system)
64 })
65 .collect(),
66 );
67 }
68
69 result
70}
71
72pub fn words_to_csv<S: DictionarySerializer>(
77 words_idx_data: &[u8],
78 words_data: &[u8],
79 words: Vec<String>,
80 serializer: &S,
81) -> LinderaResult<Vec<String>> {
82 let words_count = words_idx_data.len() / 4;
83 assert_eq!(words_count, words.len());
84
85 let mut result = vec![];
86 for (i, word) in words.into_iter().enumerate() {
87 let idx = LittleEndian::read_u32(&words_idx_data[i * 4..(i + 1) * 4]) as usize;
88 let deserialized = serializer.deserialize_with_string(&words_data[idx..], word)?;
89 result.push(deserialized);
90 }
91 Ok(result)
92}
93
#[cfg(test)]
mod tests {
    use std::error::Error;

    use jpreprocess_dictionary::serializer::lindera::LinderaSerializer;

    use crate::ipadic_builder::IpadicBuilder;

    use super::dict_to_csv;

    /// Round trip: build a user dictionary from CSV rows, convert it back with
    /// `dict_to_csv`, and check that each row comes out verbatim.
    #[test]
    fn inverse() -> Result<(), Box<dyn Error>> {
        let rows = [
            "キログラム,1360,1360,7944,名詞,接尾,助数詞,*,*,*,キログラム,キログラム,キログラム,3/5,C1,-1",
            "生麦生米生卵,3,3,10000,感動詞,*,*,*,*,*,生麦:生米:生卵,ナマムギ:ナマゴメ:ナマタマゴ,ナマムギ:ナマゴメ:ナマタマゴ,2/4:2/4:3/5,*,-1",
            "日本,1354,1354,10787,名詞,固有名詞,地域,国,*,*,日本,ニホン,ニホン,2/3,C1,-1",
        ];

        let rows_split: Vec<Vec<&str>> = rows
            .iter()
            .map(|row| row.split(',').collect())
            .collect();

        let builder = IpadicBuilder::new(Box::new(LinderaSerializer));
        let user_dict = builder.build_user_dict_from_data(&rows_split)?;

        let inverse = dict_to_csv(
            &user_dict.dict,
            &user_dict.words_idx_data,
            &user_dict.words_data,
            &LinderaSerializer,
        )?;

        // The output order is not the input order: the third input row is
        // expected second (presumably because rows are emitted in the sorted
        // key order of the inverted dictionary).
        let expected = [rows[0], rows[2], rows[1]];
        for (position, wanted) in expected.iter().enumerate() {
            assert_eq!(inverse[position], *wanted);
        }
        Ok(())
    }
}