lindera_dictionary/dictionary/
metadata.rs

1use serde::{Deserialize, Serialize};
2
3use crate::decompress::Algorithm;
4use crate::dictionary::schema::Schema;
5
6const DEFAULT_COMPRESS_ALGORITHM: Algorithm = Algorithm::Deflate;
7const DEFAULT_WORD_COST: i16 = -10000;
8const DEFAULT_LEFT_CONTEXT_ID: u16 = 1288;
9const DEFAULT_RIGHT_CONTEXT_ID: u16 = 1288;
10const DEFAULT_FIELD_VALUE: &str = "*";
11
12#[derive(Clone, Serialize, Deserialize)]
13pub struct ModelInfo {
14    pub feature_count: usize,
15    pub label_count: usize,
16    pub max_left_context_id: usize,
17    pub max_right_context_id: usize,
18    pub connection_matrix_size: String,
19    pub version: String,
20    pub training_iterations: u64,
21    pub regularization: f64,
22    pub updated_at: u64,
23}
24
25#[derive(Clone, Serialize, Deserialize)]
26pub struct Metadata {
27    pub name: String,                   // Name of the dictionary
28    pub encoding: String,               // Character encoding
29    pub compress_algorithm: Algorithm,  // Compression algorithm
30    pub default_word_cost: i16,         // Word cost for simple user dictionary
31    pub default_left_context_id: u16,   // Context ID for simple user dictionary
32    pub default_right_context_id: u16,  // Context ID for simple user dictionary
33    pub default_field_value: String,    // Default value for fields in simple user dictionary
34    pub flexible_csv: bool,             // Handle CSV columns flexibly
35    pub skip_invalid_cost_or_id: bool,  // Skip invalid cost or ID
36    pub normalize_details: bool,        // Normalize characters
37    pub dictionary_schema: Schema,      // Schema for the dictionary
38    pub user_dictionary_schema: Schema, // Schema for user dictionary
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub model_info: Option<ModelInfo>, // Training model information (optional)
41}
42
43impl Default for Metadata {
44    fn default() -> Self {
45        // Default metadata values can be adjusted as needed
46        Metadata::new(
47            "default".to_string(),
48            "UTF-8".to_string(),
49            DEFAULT_COMPRESS_ALGORITHM,
50            DEFAULT_WORD_COST,
51            DEFAULT_LEFT_CONTEXT_ID,
52            DEFAULT_RIGHT_CONTEXT_ID,
53            DEFAULT_FIELD_VALUE.to_string(),
54            false,
55            false,
56            false,
57            Schema::default(),
58            Schema::new(vec![
59                "surface".to_string(),
60                "reading".to_string(),
61                "pronunciation".to_string(),
62            ]),
63        )
64    }
65}
66
67impl Metadata {
68    #[allow(clippy::too_many_arguments)]
69    pub fn new(
70        name: String,
71        encoding: String,
72        compress_algorithm: Algorithm,
73        simple_word_cost: i16,
74        default_left_context_id: u16,
75        default_right_context_id: u16,
76        default_field_value: String,
77        flexible_csv: bool,
78        skip_invalid_cost_or_id: bool,
79        normalize_details: bool,
80        schema: Schema,
81        userdic_schema: Schema,
82    ) -> Self {
83        Self {
84            encoding,
85            compress_algorithm,
86            default_word_cost: simple_word_cost,
87            default_left_context_id,
88            default_right_context_id,
89            default_field_value,
90            dictionary_schema: schema,
91            name,
92            flexible_csv,
93            skip_invalid_cost_or_id,
94            normalize_details,
95            user_dictionary_schema: userdic_schema,
96            model_info: None,
97        }
98    }
99
100    /// Load metadata from binary data (JSON format or compressed binary format).
101    /// This provides a consistent interface with other dictionary components.
102    pub fn load(data: &[u8]) -> crate::LinderaResult<Self> {
103        // If data is empty, return an error since metadata is required
104        if data.is_empty() {
105            return Err(crate::error::LinderaErrorKind::Io
106                .with_error(anyhow::anyhow!("Empty metadata data")));
107        }
108
109        // Try to deserialize as JSON first (for uncompressed metadata.json files)
110        if let Ok(metadata) = serde_json::from_slice(data) {
111            return Ok(metadata);
112        }
113
114        // If JSON fails, try to decompress as bincode-encoded compressed data
115        #[cfg(feature = "compress")]
116        {
117            use crate::decompress::{CompressedData, decompress};
118
119            if let Ok((compressed_data, _)) = bincode::serde::decode_from_slice::<CompressedData, _>(
120                data,
121                bincode::config::legacy(),
122            ) {
123                if let Ok(decompressed) = decompress(compressed_data) {
124                    // Try to parse the decompressed data as JSON
125                    if let Ok(metadata) = serde_json::from_slice(&decompressed) {
126                        return Ok(metadata);
127                    }
128                }
129            }
130        }
131
132        #[cfg(not(feature = "compress"))]
133        {
134            // Without compress feature, data should be raw JSON
135            return serde_json::from_slice(data).map_err(|err| {
136                crate::error::LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(err))
137            });
138        }
139
140        // If all attempts fail, return an error
141        Err(
142            crate::error::LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(
143                "Failed to deserialize metadata from any supported format"
144            )),
145        )
146    }
147
148    /// Load metadata with fallback to default values.
149    /// This is used when feature flags are disabled and data might be empty.
150    pub fn load_or_default(data: &[u8], default_fn: fn() -> Self) -> Self {
151        if data.is_empty() {
152            default_fn()
153        } else {
154            match Self::load(data) {
155                Ok(metadata) => metadata,
156                Err(_) => default_fn(),
157            }
158        }
159    }
160}
161
#[cfg(test)]
mod tests {
    use super::*;

    /// `Metadata::default()` must reflect the module-level default constants.
    #[test]
    fn test_metadata_default() {
        let metadata = Metadata::default();
        assert_eq!(metadata.name, "default");
        assert_eq!(metadata.encoding, "UTF-8");
        assert_eq!(metadata.default_word_cost, DEFAULT_WORD_COST);
        assert_eq!(metadata.default_left_context_id, DEFAULT_LEFT_CONTEXT_ID);
        assert_eq!(metadata.default_right_context_id, DEFAULT_RIGHT_CONTEXT_ID);
        assert_eq!(metadata.default_field_value, DEFAULT_FIELD_VALUE);
        assert!(!metadata.flexible_csv);
        assert!(!metadata.skip_invalid_cost_or_id);
        assert!(!metadata.normalize_details);
        assert!(metadata.model_info.is_none());
    }

    /// `Metadata::new` must store each argument in the matching field.
    /// Distinct values are used for neighbouring arguments of the same type so
    /// that an accidental swap would be detected.
    #[test]
    fn test_metadata_new() {
        let metadata = Metadata::new(
            "TestDict".to_string(),
            "UTF-8".to_string(),
            Algorithm::Deflate,
            -10000,
            1,
            2,
            "*".to_string(),
            true,
            false,
            true,
            Schema::default(),
            Schema::new(vec!["surface".to_string(), "reading".to_string()]),
        );
        assert_eq!(metadata.name, "TestDict");
        assert_eq!(metadata.encoding, "UTF-8");
        assert_eq!(metadata.default_word_cost, -10000);
        assert_eq!(metadata.default_left_context_id, 1);
        assert_eq!(metadata.default_right_context_id, 2);
        assert_eq!(metadata.default_field_value, "*");
        assert!(metadata.flexible_csv);
        assert!(!metadata.skip_invalid_cost_or_id);
        assert!(metadata.normalize_details);
        assert!(metadata.model_info.is_none());
    }

    /// JSON round trip through serde and through `Metadata::load`.
    #[test]
    fn test_metadata_serialization() {
        let metadata = Metadata::default();

        // Test serialization
        let serialized = serde_json::to_string(&metadata).unwrap();
        assert!(serialized.contains("default"));
        assert!(serialized.contains("schema"));
        assert!(serialized.contains("name"));
        // `model_info` is `None` and therefore skipped during serialization.
        assert!(!serialized.contains("model_info"));

        // Test deserialization
        let deserialized: Metadata = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized.name, "default");
        assert!(deserialized.model_info.is_none());

        // `load` accepts the same uncompressed JSON bytes.
        let loaded_name = Metadata::load(serialized.as_bytes())
            .ok()
            .map(|loaded| loaded.name);
        assert_eq!(loaded_name.as_deref(), Some("default"));
    }

    /// Empty input is an error for `load` and triggers the fallback in
    /// `load_or_default`.
    #[test]
    fn test_metadata_load_empty() {
        assert!(Metadata::load(&[]).is_err());

        let fallback = Metadata::load_or_default(&[], Metadata::default);
        assert_eq!(fallback.name, "default");
    }
}