lindera_dictionary/dictionary/
metadata.rs

1use serde::{Deserialize, Serialize};
2
3use crate::decompress::Algorithm;
4use crate::dictionary::schema::Schema;
5
6const DEFAULT_COMPRESS_ALGORITHM: Algorithm = Algorithm::Deflate;
7const DEFAULT_WORD_COST: i16 = -10000;
8const DEFAULT_LEFT_CONTEXT_ID: u16 = 1288;
9const DEFAULT_RIGHT_CONTEXT_ID: u16 = 1288;
10const DEFAULT_FIELD_VALUE: &str = "*";
11
12#[derive(Clone, Serialize, Deserialize)]
13pub struct Metadata {
14    pub name: String,                   // Name of the dictionary
15    pub encoding: String,               // Character encoding
16    pub compress_algorithm: Algorithm,  // Compression algorithm
17    pub default_word_cost: i16,         // Word cost for simple user dictionary
18    pub default_left_context_id: u16,   // Context ID for simple user dictionary
19    pub default_right_context_id: u16,  // Context ID for simple user dictionary
20    pub default_field_value: String,    // Default value for fields in simple user dictionary
21    pub flexible_csv: bool,             // Handle CSV columns flexibly
22    pub skip_invalid_cost_or_id: bool,  // Skip invalid cost or ID
23    pub normalize_details: bool,        // Normalize characters
24    pub dictionary_schema: Schema,      // Schema for the dictionary
25    pub user_dictionary_schema: Schema, // Schema for user dictionary
26}
27
28impl Default for Metadata {
29    fn default() -> Self {
30        // Default metadata values can be adjusted as needed
31        Metadata::new(
32            "default".to_string(),
33            "UTF-8".to_string(),
34            DEFAULT_COMPRESS_ALGORITHM,
35            DEFAULT_WORD_COST,
36            DEFAULT_LEFT_CONTEXT_ID,
37            DEFAULT_RIGHT_CONTEXT_ID,
38            DEFAULT_FIELD_VALUE.to_string(),
39            false,
40            false,
41            false,
42            Schema::default(),
43            Schema::new(vec![
44                "surface".to_string(),
45                "reading".to_string(),
46                "pronunciation".to_string(),
47            ]),
48        )
49    }
50}
51
52impl Metadata {
53    #[allow(clippy::too_many_arguments)]
54    pub fn new(
55        name: String,
56        encoding: String,
57        compress_algorithm: Algorithm,
58        simple_word_cost: i16,
59        default_left_context_id: u16,
60        default_right_context_id: u16,
61        default_field_value: String,
62        flexible_csv: bool,
63        skip_invalid_cost_or_id: bool,
64        normalize_details: bool,
65        schema: Schema,
66        userdic_schema: Schema,
67    ) -> Self {
68        Self {
69            encoding,
70            compress_algorithm,
71            default_word_cost: simple_word_cost,
72            default_left_context_id,
73            default_right_context_id,
74            default_field_value,
75            dictionary_schema: schema,
76            name,
77            flexible_csv,
78            skip_invalid_cost_or_id,
79            normalize_details,
80            user_dictionary_schema: userdic_schema,
81        }
82    }
83
84    /// Load metadata from binary data (JSON format or compressed binary format).
85    /// This provides a consistent interface with other dictionary components.
86    pub fn load(data: &[u8]) -> crate::LinderaResult<Self> {
87        // If data is empty, return an error since metadata is required
88        if data.is_empty() {
89            return Err(crate::error::LinderaErrorKind::Io
90                .with_error(anyhow::anyhow!("Empty metadata data")));
91        }
92
93        // Try to deserialize as JSON first (for uncompressed metadata.json files)
94        if let Ok(metadata) = serde_json::from_slice(data) {
95            return Ok(metadata);
96        }
97
98        // If JSON fails, try to decompress as bincode-encoded compressed data
99        #[cfg(feature = "compress")]
100        {
101            use crate::decompress::{CompressedData, decompress};
102
103            if let Ok((compressed_data, _)) = bincode::serde::decode_from_slice::<CompressedData, _>(
104                data,
105                bincode::config::legacy(),
106            ) {
107                if let Ok(decompressed) = decompress(compressed_data) {
108                    // Try to parse the decompressed data as JSON
109                    if let Ok(metadata) = serde_json::from_slice(&decompressed) {
110                        return Ok(metadata);
111                    }
112                }
113            }
114        }
115
116        #[cfg(not(feature = "compress"))]
117        {
118            // Without compress feature, data should be raw JSON
119            return serde_json::from_slice(data).map_err(|err| {
120                crate::error::LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(err))
121            });
122        }
123
124        // If all attempts fail, return an error
125        Err(
126            crate::error::LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(
127                "Failed to deserialize metadata from any supported format"
128            )),
129        )
130    }
131
132    /// Load metadata with fallback to default values.
133    /// This is used when feature flags are disabled and data might be empty.
134    pub fn load_or_default(data: &[u8], default_fn: fn() -> Self) -> Self {
135        if data.is_empty() {
136            default_fn()
137        } else {
138            match Self::load(data) {
139                Ok(metadata) => metadata,
140                Err(_) => default_fn(),
141            }
142        }
143    }
144}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// `Metadata::default()` must expose the module-level default values.
    #[test]
    fn test_metadata_default() {
        let metadata = Metadata::default();
        assert_eq!(metadata.name, "default");
        assert_eq!(metadata.encoding, "UTF-8");
        assert_eq!(metadata.default_word_cost, DEFAULT_WORD_COST);
        assert_eq!(metadata.default_left_context_id, DEFAULT_LEFT_CONTEXT_ID);
        assert_eq!(metadata.default_right_context_id, DEFAULT_RIGHT_CONTEXT_ID);
        assert_eq!(metadata.default_field_value, DEFAULT_FIELD_VALUE);
        assert!(!metadata.flexible_csv);
        assert!(!metadata.skip_invalid_cost_or_id);
        assert!(!metadata.normalize_details);
    }

    /// Every argument of `Metadata::new` must land in its matching field.
    /// Distinct values are used so that swapped arguments are detected.
    #[test]
    fn test_metadata_new() {
        let metadata = Metadata::new(
            "TestDict".to_string(),
            "UTF-8".to_string(),
            Algorithm::Deflate,
            -10000,
            1,
            2,
            "*".to_string(),
            true,
            false,
            true,
            Schema::default(),
            Schema::new(vec!["surface".to_string(), "reading".to_string()]),
        );
        assert_eq!(metadata.name, "TestDict");
        assert_eq!(metadata.encoding, "UTF-8");
        assert_eq!(metadata.default_word_cost, -10000);
        assert_eq!(metadata.default_left_context_id, 1);
        assert_eq!(metadata.default_right_context_id, 2);
        assert_eq!(metadata.default_field_value, "*");
        assert!(metadata.flexible_csv);
        assert!(!metadata.skip_invalid_cost_or_id);
        assert!(metadata.normalize_details);
    }

    /// A JSON round trip must preserve the metadata values.
    #[test]
    fn test_metadata_serialization() {
        let metadata = Metadata::default();

        // Test serialization
        let serialized = serde_json::to_string(&metadata).unwrap();
        assert!(serialized.contains("default"));
        assert!(serialized.contains("schema"));
        assert!(serialized.contains("name"));

        // Test deserialization
        let deserialized: Metadata = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized.name, "default");
        assert_eq!(deserialized.encoding, metadata.encoding);
        assert_eq!(deserialized.default_word_cost, metadata.default_word_cost);
        assert_eq!(
            deserialized.default_field_value,
            metadata.default_field_value
        );
    }

    /// Empty input is rejected because metadata is required.
    #[test]
    fn test_metadata_load_empty_is_error() {
        assert!(Metadata::load(&[]).is_err());
    }

    /// Plain JSON bytes are accepted by `Metadata::load`.
    #[test]
    fn test_metadata_load_json() {
        let mut metadata = Metadata::default();
        metadata.name = "TestDict".to_string();
        let bytes = serde_json::to_vec(&metadata).unwrap();

        let loaded = match Metadata::load(&bytes) {
            Ok(loaded) => loaded,
            Err(_) => panic!("plain JSON metadata should load"),
        };
        assert_eq!(loaded.name, "TestDict");
        assert_eq!(loaded.default_word_cost, metadata.default_word_cost);
    }

    /// Empty input makes `load_or_default` fall back to the supplied default.
    #[test]
    fn test_metadata_load_or_default_empty() {
        let metadata = Metadata::load_or_default(&[], Metadata::default);
        assert_eq!(metadata.name, "default");
    }
}