jpreprocess_dictionary/dictionary/
word_encoding.rs

1use byteorder::{ByteOrder, LittleEndian};
2use jpreprocess_core::word_line::WordDetailsLine;
3use lindera_dictionary::{error::LinderaErrorKind, LinderaResult};
4
5/// A trait for encoding and decoding as dictionary entry.
6pub trait DictionaryWordEncoding: Sized {
7    fn identifier() -> &'static str;
8    fn encode(row: WordDetailsLine) -> LinderaResult<Vec<u8>>;
9    fn decode(string: String, details: &[u8]) -> LinderaResult<Vec<String>>;
10}
11
12pub struct JPreprocessDictionaryWordEncoding;
13impl JPreprocessDictionaryWordEncoding {
14    pub fn serialize(
15        data: &jpreprocess_core::word_entry::WordEntry,
16    ) -> Result<Vec<u8>, bincode::error::EncodeError> {
17        bincode::serde::encode_to_vec(data, Self::bincode_option())
18    }
19    pub fn deserialize(
20        data: &[u8],
21    ) -> Result<jpreprocess_core::word_entry::WordEntry, bincode::error::DecodeError> {
22        let (decoded, _size) = bincode::serde::decode_from_slice(data, Self::bincode_option())?;
23        Ok(decoded)
24    }
25
26    fn bincode_option() -> bincode::config::Configuration {
27        bincode::config::standard()
28            .with_no_limit()
29            .with_little_endian()
30    }
31}
32impl DictionaryWordEncoding for JPreprocessDictionaryWordEncoding {
33    fn identifier() -> &'static str {
34        concat!("jpreprocess ", env!("CARGO_PKG_VERSION"))
35    }
36
37    fn encode(row: WordDetailsLine) -> LinderaResult<Vec<u8>> {
38        let data = row
39            .try_into()
40            .map_err(|err| LinderaErrorKind::Serialize.with_error(err))?;
41        Self::serialize(&data).map_err(|err| LinderaErrorKind::Serialize.with_error(err))
42    }
43
44    fn decode(string: String, data: &[u8]) -> LinderaResult<Vec<String>> {
45        let word_details: jpreprocess_core::word_entry::WordEntry =
46            Self::deserialize(data).map_err(|err| LinderaErrorKind::Deserialize.with_error(err))?;
47        Ok(word_details.to_str_vec(string).to_vec())
48    }
49}
50
51pub struct LinderaSystemDictionaryWordEncoding;
52impl DictionaryWordEncoding for LinderaSystemDictionaryWordEncoding {
53    fn identifier() -> &'static str {
54        unimplemented!("JPreprocess does not support building in Lindera dictionary format")
55    }
56
57    fn encode(_row: WordDetailsLine) -> LinderaResult<Vec<u8>> {
58        unimplemented!("JPreprocess does not support building in Lindera dictionary format")
59    }
60
61    fn decode(_string: String, data: &[u8]) -> LinderaResult<Vec<String>> {
62        let len = LittleEndian::read_u32(data) as usize;
63        let data = &data[4..4 + len];
64
65        let mut details = Vec::new();
66        for bytes in data.split(|&b| b == 0) {
67            let detail = match std::str::from_utf8(bytes) {
68                Ok(s) => s,
69                Err(err) => return Err(LinderaErrorKind::Deserialize.with_error(err)),
70            };
71            details.push(detail.to_string());
72        }
73        Ok(details)
74    }
75}
76
77pub struct LinderaUserDictionaryWordEncoding;
78impl DictionaryWordEncoding for LinderaUserDictionaryWordEncoding {
79    fn identifier() -> &'static str {
80        unimplemented!("JPreprocess does not support building in Lindera dictionary format")
81    }
82
83    fn encode(_row: WordDetailsLine) -> LinderaResult<Vec<u8>> {
84        unimplemented!("JPreprocess does not support building in Lindera dictionary format")
85    }
86
87    fn decode(_string: String, data: &[u8]) -> LinderaResult<Vec<String>> {
88        LinderaSystemDictionaryWordEncoding::decode(_string, data)
89    }
90}