jpreprocess_dictionary_builder/to_csv/
mod.rs

1use crate::build_dict::WordEntryMap;
2use byteorder::{ByteOrder, LittleEndian};
3use jpreprocess_dictionary::DictionarySerializer;
4use lindera_core::{prefix_dict::PrefixDict, word_entry::WordEntry, LinderaResult};
5use std::collections::BTreeMap;
6
7use self::da::DoubleArrayParser;
8
9mod da;
10
11/// Converts dictionary to csv.
12///
13/// The third column (right_id) cannot be recovered
14/// because it is already lost while building the dictionary.
15pub fn dict_to_csv<S: DictionarySerializer>(
16    prefix_dict: &PrefixDict,
17    words_idx_data: &[u8],
18    words_data: &[u8],
19    serializer: &S,
20) -> LinderaResult<Vec<String>> {
21    let word_entry_map = inverse_prefix_dict(prefix_dict, true);
22
23    let rows: Vec<(String, WordEntry)> = word_entry_map
24        .into_iter()
25        .flat_map(|(string, word_entries)| {
26            word_entries
27                .into_iter()
28                .map(move |word_entry| (string.to_owned(), word_entry))
29        })
30        .collect();
31
32    let words: Vec<String> = rows.iter().map(|w| w.0.to_owned()).collect();
33
34    Ok(rows
35        .into_iter()
36        .zip(words_to_csv(words_idx_data, words_data, words, serializer)?)
37        .map(|((string, word_entry), right)| {
38            format!(
39                "{},{},{},{},{}",
40                string, word_entry.left_id, word_entry.right_id, word_entry.word_cost, right
41            )
42        })
43        .collect())
44}
45
46/// Converts prefix dict to WordEntry map.
47///
48/// This is considered to be inverse of build_prefix_dict,
49/// and no data is expected to be lost.
50pub fn inverse_prefix_dict(prefix_dict: &PrefixDict, is_system: bool) -> WordEntryMap {
51    let mut result: WordEntryMap = BTreeMap::new();
52
53    let keyset = DoubleArrayParser(&prefix_dict.da.0).inverse_da();
54    for (s, offset_len) in keyset {
55        let len = offset_len & 0x1f;
56        let offset = offset_len >> 5;
57        let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
58        let data: &[u8] = &prefix_dict.vals_data[offset_bytes..];
59        result.insert(
60            s,
61            (0..len as usize)
62                .map(move |i| {
63                    WordEntry::deserialize(&data[WordEntry::SERIALIZED_LEN * i..], is_system)
64                })
65                .collect(),
66        );
67    }
68
69    result
70}
71
72/// Converts words data to csv.
73///
74/// Note that some data (e.g. orig) is already lost in dictionary build process
75/// if jpreprocess serlializer is used.
76pub fn words_to_csv<S: DictionarySerializer>(
77    words_idx_data: &[u8],
78    words_data: &[u8],
79    words: Vec<String>,
80    serializer: &S,
81) -> LinderaResult<Vec<String>> {
82    let words_count = words_idx_data.len() / 4;
83    assert_eq!(words_count, words.len());
84
85    let mut result = vec![];
86    for (i, word) in words.into_iter().enumerate() {
87        let idx = LittleEndian::read_u32(&words_idx_data[i * 4..(i + 1) * 4]) as usize;
88        let deserialized = serializer.deserialize_with_string(&words_data[idx..], word)?;
89        result.push(deserialized);
90    }
91    Ok(result)
92}
93
#[cfg(test)]
mod tests {
    use std::error::Error;

    use jpreprocess_dictionary::serializer::lindera::LinderaSerializer;

    use crate::ipadic_builder::IpadicBuilder;

    use super::dict_to_csv;

    /// Builds a user dictionary from three CSV rows and checks that
    /// `dict_to_csv` reproduces every row verbatim.
    #[test]
    fn inverse() -> Result<(), Box<dyn Error>> {
        let rows=[
            "キログラム,1360,1360,7944,名詞,接尾,助数詞,*,*,*,キログラム,キログラム,キログラム,3/5,C1,-1",
            "生麦生米生卵,3,3,10000,感動詞,*,*,*,*,*,生麦:生米:生卵,ナマムギ:ナマゴメ:ナマタマゴ,ナマムギ:ナマゴメ:ナマタマゴ,2/4:2/4:3/5,*,-1",
            "日本,1354,1354,10787,名詞,固有名詞,地域,国,*,*,日本,ニホン,ニホン,2/3,C1,-1"
        ];

        let rows_split: Vec<Vec<&str>> = rows
            .iter()
            .map(|row| row.split(',').collect())
            .collect();

        let builder = IpadicBuilder::new(Box::new(LinderaSerializer));
        let user_dict = builder.build_user_dict_from_data(&rows_split)?;

        let inverse = dict_to_csv(
            &user_dict.dict,
            &user_dict.words_idx_data,
            &user_dict.words_data,
            &LinderaSerializer,
        )?;

        // The output follows the ordered map's key order rather than the
        // input order, so the second and third input rows come back swapped.
        for (position, &source_row) in [0usize, 2, 1].iter().enumerate() {
            assert_eq!(inverse[position], rows[source_row]);
        }
        Ok(())
    }
}