1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
use crate::build_dict::WordEntryMap;
use byteorder::{ByteOrder, LittleEndian};
use jpreprocess_dictionary::DictionarySerializer;
use lindera_core::{prefix_dict::PrefixDict, word_entry::WordEntry, LinderaResult};
use std::collections::BTreeMap;

use self::da::DoubleArrayParser;

mod da;

/// Converts dictionary to csv.
///
/// The third column (right_id) cannot be recovered
/// because it is already lost while building the dictionary.
pub fn dict_to_csv<S: DictionarySerializer>(
    prefix_dict: &PrefixDict,
    words_idx_data: &[u8],
    words_data: &[u8],
    serializer: &S,
) -> LinderaResult<Vec<String>> {
    let word_entry_map = inverse_prefix_dict(prefix_dict, true);

    let rows: Vec<(String, WordEntry)> = word_entry_map
        .into_iter()
        .flat_map(|(string, word_entries)| {
            word_entries
                .into_iter()
                .map(move |word_entry| (string.to_owned(), word_entry))
        })
        .collect();

    let words: Vec<String> = rows.iter().map(|w| w.0.to_owned()).collect();

    Ok(rows
        .into_iter()
        .zip(words_to_csv(words_idx_data, words_data, words, serializer)?)
        .map(|((string, word_entry), right)| {
            format!(
                "{},{},{},{},{}",
                string,
                word_entry.cost_id,
                // Lindera does not use right_id, so assuming that it is same as the left_id
                word_entry.cost_id,
                word_entry.word_cost,
                right
            )
        })
        .collect())
}

/// Converts prefix dict to WordEntry map.
///
/// This is considered to be inverse of build_prefix_dict,
/// and no data is expected to be lost.
pub fn inverse_prefix_dict(prefix_dict: &PrefixDict, is_system: bool) -> WordEntryMap {
    let mut result: WordEntryMap = BTreeMap::new();

    let keyset = DoubleArrayParser(&prefix_dict.da.0).inverse_da();
    for (s, offset_len) in keyset {
        let len = offset_len & 0x1f;
        let offset = offset_len >> 5;
        let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
        let data: &[u8] = &prefix_dict.vals_data[offset_bytes..];
        result.insert(
            s,
            (0..len as usize)
                .map(move |i| {
                    WordEntry::deserialize(&data[WordEntry::SERIALIZED_LEN * i..], is_system)
                })
                .collect(),
        );
    }

    result
}

/// Converts words data to csv.
///
/// Note that some data (e.g. orig) is already lost in dictionary build process
/// if jpreprocess serlializer is used.
pub fn words_to_csv<S: DictionarySerializer>(
    words_idx_data: &[u8],
    words_data: &[u8],
    words: Vec<String>,
    serializer: &S,
) -> LinderaResult<Vec<String>> {
    let words_count = words_idx_data.len() / 4;
    assert_eq!(words_count, words.len());

    let mut result = vec![];
    for (i, word) in words.into_iter().enumerate() {
        let idx = LittleEndian::read_u32(&words_idx_data[i * 4..(i + 1) * 4]) as usize;
        let deserialized = serializer.deserialize_with_string(&words_data[idx..], word)?;
        result.push(deserialized);
    }
    Ok(result)
}

#[cfg(test)]
mod tests {
    use std::error::Error;

    use jpreprocess_dictionary::serializer::lindera::LinderaSerializer;

    use crate::ipadic_builder::IpadicBuilder;

    use super::dict_to_csv;

    #[test]
    fn inverse() -> Result<(), Box<dyn Error>> {
        let rows=[
            "キログラム,1360,1360,7944,名詞,接尾,助数詞,*,*,*,キログラム,キログラム,キログラム,3/5,C1,-1",
            "生麦生米生卵,3,3,10000,感動詞,*,*,*,*,*,生麦:生米:生卵,ナマムギ:ナマゴメ:ナマタマゴ,ナマムギ:ナマゴメ:ナマタマゴ,2/4:2/4:3/5,*,-1",
            "日本,1354,1354,10787,名詞,固有名詞,地域,国,*,*,日本,ニホン,ニホン,2/3,C1,-1"
        ];

        let rows_split: Vec<Vec<&str>> = rows.map(|s| s.split(',').collect()).to_vec();

        let builder = IpadicBuilder::new(Box::new(LinderaSerializer));
        let user_dict = builder.build_user_dict_from_data(&rows_split)?;

        let inverse = dict_to_csv(
            &user_dict.dict,
            &user_dict.words_idx_data,
            &user_dict.words_data,
            &LinderaSerializer,
        )?;

        assert_eq!(inverse[0], rows[0]);
        assert_eq!(inverse[1], rows[2]);
        assert_eq!(inverse[2], rows[1]);
        Ok(())
    }
}