jpreprocess_dictionary/dictionary/to_csv/
mod.rs

1use lindera_dictionary::{
2    dictionary::prefix_dictionary::PrefixDictionary, viterbi::WordEntry, LinderaResult,
3};
4use std::collections::BTreeMap;
5
6use crate::word_data::get_word_data;
7
8use self::da::DoubleArrayParser;
9
10use super::{word_encoding::DictionaryWordEncoding, WordEntryMap};
11
12mod da;
13
14/// Converts prefix dictionary back to csv.
15pub fn dict_to_csv<E: DictionaryWordEncoding>(
16    prefix_dictionary: &PrefixDictionary,
17) -> LinderaResult<Vec<String>> {
18    let word_entry_map = inverse_prefix_dict(prefix_dictionary, true);
19
20    let rows: Vec<(String, WordEntry)> = word_entry_map
21        .into_iter()
22        .flat_map(|(string, word_entries)| {
23            word_entries
24                .into_iter()
25                .map(move |word_entry| (string.to_owned(), word_entry))
26        })
27        .collect();
28
29    Ok(rows
30        .into_iter()
31        .map(|(string, word_entry)| {
32            let word_data = get_word_data(
33                &prefix_dictionary.words_idx_data,
34                &prefix_dictionary.words_data,
35                Some(word_entry.word_id.id as usize),
36            )
37            .unwrap();
38            let details = E::decode(string.clone(), word_data).unwrap();
39
40            format!(
41                "{},{},{},{},{}",
42                string,
43                word_entry.left_id,
44                word_entry.right_id,
45                word_entry.word_cost,
46                details.join(",")
47            )
48        })
49        .collect())
50}
51
52/// Converts prefix dict to WordEntry map.
53///
54/// This is considered to be inverse of build_prefix_dict,
55/// and no data is expected to be lost.
56pub fn inverse_prefix_dict(prefix_dictionary: &PrefixDictionary, is_system: bool) -> WordEntryMap {
57    let mut result: WordEntryMap = BTreeMap::new();
58
59    let keyset = DoubleArrayParser(&prefix_dictionary.da.0).inverse_da();
60    for (s, offset_len) in keyset {
61        let len = offset_len & 0x1f;
62        let offset = offset_len >> 5;
63        let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
64        let data: &[u8] = &prefix_dictionary.vals_data[offset_bytes..];
65        result.insert(
66            s,
67            (0..len as usize)
68                .map(move |i| {
69                    WordEntry::deserialize(&data[WordEntry::SERIALIZED_LEN * i..], is_system)
70                })
71                .collect(),
72        );
73    }
74
75    result
76}
77
#[cfg(test)]
mod tests {
    use crate::dictionary::word_encoding::{
        JPreprocessDictionaryWordEncoding, LinderaUserDictionaryWordEncoding,
    };

    use super::dict_to_csv;
    use std::{error::Error, path::PathBuf};

    /// Asserts that the regenerated rows match the first three lines of the
    /// CSV the dictionary was built from.
    ///
    /// The second and third rows are compared crosswise: `dict_to_csv` walks a
    /// `BTreeMap`, so its rows come back in key order, which presumably differs
    /// from the order of those two rows in the fixture.
    fn assert_round_trip(inverse: &[String], input_content: &str) {
        // `lines()` also strips a trailing `\r`, so a fixture checked out with
        // CRLF line endings still compares equal (a plain `split('\n')` would
        // leave the `\r` on every row and fail the assertions).
        let rows = input_content.lines().collect::<Vec<_>>();

        assert_eq!(inverse[0], rows[0]);
        assert_eq!(inverse[1], rows[2]);
        assert_eq!(inverse[2], rows[1]);
    }

    /// Round trip through lindera's own user dictionary builder and encoding.
    #[test]
    fn inverse_lindera() -> Result<(), Box<dyn Error>> {
        let input_file = PathBuf::from("./test.csv");

        let builder =
            lindera_dictionary::builder::user_dictionary::UserDictionaryBuilderOptions::default()
                .builder()
                .unwrap();
        let user_dict = builder.build(&input_file).unwrap();

        let inverse = dict_to_csv::<LinderaUserDictionaryWordEncoding>(&user_dict.dict)?;

        let input_content = std::fs::read_to_string(&input_file)?;
        assert_round_trip(&inverse, &input_content);
        Ok(())
    }

    /// Round trip through the jpreprocess dictionary builder and encoding.
    #[test]
    fn inverse_jpreprocess() -> Result<(), Box<dyn Error>> {
        let input_file = PathBuf::from("./test.csv");

        let builder = crate::dictionary::to_dict::JPreprocessDictionaryBuilder::default();
        let user_dict = builder.build_user_dict(&input_file).unwrap();

        let inverse = dict_to_csv::<JPreprocessDictionaryWordEncoding>(&user_dict.dict)?;

        let input_content = std::fs::read_to_string(&input_file)?;
        assert_round_trip(&inverse, &input_content);
        Ok(())
    }
}