jpreprocess_dictionary/dictionary/to_csv/
mod.rs1use lindera_dictionary::{
2 dictionary::prefix_dictionary::PrefixDictionary, viterbi::WordEntry, LinderaResult,
3};
4use std::collections::BTreeMap;
5
6use crate::word_data::get_word_data;
7
8use self::da::DoubleArrayParser;
9
10use super::{word_encoding::DictionaryWordEncoding, WordEntryMap};
11
12mod da;
13
14pub fn dict_to_csv<E: DictionaryWordEncoding>(
16 prefix_dictionary: &PrefixDictionary,
17) -> LinderaResult<Vec<String>> {
18 let word_entry_map = inverse_prefix_dict(prefix_dictionary, true);
19
20 let rows: Vec<(String, WordEntry)> = word_entry_map
21 .into_iter()
22 .flat_map(|(string, word_entries)| {
23 word_entries
24 .into_iter()
25 .map(move |word_entry| (string.to_owned(), word_entry))
26 })
27 .collect();
28
29 Ok(rows
30 .into_iter()
31 .map(|(string, word_entry)| {
32 let word_data = get_word_data(
33 &prefix_dictionary.words_idx_data,
34 &prefix_dictionary.words_data,
35 Some(word_entry.word_id.id as usize),
36 )
37 .unwrap();
38 let details = E::decode(string.clone(), word_data).unwrap();
39
40 format!(
41 "{},{},{},{},{}",
42 string,
43 word_entry.left_id,
44 word_entry.right_id,
45 word_entry.word_cost,
46 details.join(",")
47 )
48 })
49 .collect())
50}
51
52pub fn inverse_prefix_dict(prefix_dictionary: &PrefixDictionary, is_system: bool) -> WordEntryMap {
57 let mut result: WordEntryMap = BTreeMap::new();
58
59 let keyset = DoubleArrayParser(&prefix_dictionary.da.0).inverse_da();
60 for (s, offset_len) in keyset {
61 let len = offset_len & 0x1f;
62 let offset = offset_len >> 5;
63 let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
64 let data: &[u8] = &prefix_dictionary.vals_data[offset_bytes..];
65 result.insert(
66 s,
67 (0..len as usize)
68 .map(move |i| {
69 WordEntry::deserialize(&data[WordEntry::SERIALIZED_LEN * i..], is_system)
70 })
71 .collect(),
72 );
73 }
74
75 result
76}
77
#[cfg(test)]
mod tests {
    use std::{
        error::Error,
        path::{Path, PathBuf},
    };

    use super::dict_to_csv;
    use crate::dictionary::word_encoding::{
        JPreprocessDictionaryWordEncoding, LinderaUserDictionaryWordEncoding,
    };

    /// Compares the first three regenerated CSV rows with the fixture file.
    ///
    /// The fixture's second and third lines are expected in swapped order.
    /// NOTE(review): presumably because the regenerated rows come out ordered
    /// by surface form while the fixture is not sorted -- confirm against
    /// `test.csv`.
    fn assert_matches_fixture(regenerated: &[String], fixture: &Path) {
        let content = std::fs::read_to_string(fixture).unwrap();
        let expected: Vec<&str> = content.split('\n').collect();

        assert_eq!(regenerated[0], expected[0]);
        assert_eq!(regenerated[1], expected[2]);
        assert_eq!(regenerated[2], expected[1]);
    }

    /// Builds a user dictionary with lindera's builder and converts it back
    /// to CSV using the lindera user-dictionary encoding.
    #[test]
    fn inverse_lindera() -> Result<(), Box<dyn Error>> {
        let fixture = PathBuf::from("./test.csv");

        let options =
            lindera_dictionary::builder::user_dictionary::UserDictionaryBuilderOptions::default();
        let builder = options.builder().unwrap();
        let user_dict = builder.build(&fixture).unwrap();

        let regenerated = dict_to_csv::<LinderaUserDictionaryWordEncoding>(&user_dict.dict)?;

        assert_matches_fixture(&regenerated, &fixture);
        Ok(())
    }

    /// Builds a user dictionary with the jpreprocess builder and converts it
    /// back to CSV using the jpreprocess encoding.
    #[test]
    fn inverse_jpreprocess() -> Result<(), Box<dyn Error>> {
        let fixture = PathBuf::from("./test.csv");

        let builder = crate::dictionary::to_dict::JPreprocessDictionaryBuilder::default();
        let user_dict = builder.build_user_dict(&fixture).unwrap();

        let regenerated = dict_to_csv::<JPreprocessDictionaryWordEncoding>(&user_dict.dict)?;

        assert_matches_fixture(&regenerated, &fixture);
        Ok(())
    }
}