lindera_dictionary/dictionary/
metadata.rs

1use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
2use serde::{Deserialize, Serialize};
3
4use crate::decompress::Algorithm;
5use crate::dictionary::schema::Schema;
6
7const DEFAULT_COMPRESS_ALGORITHM: Algorithm = Algorithm::Deflate;
8const DEFAULT_WORD_COST: i16 = -10000;
9const DEFAULT_LEFT_CONTEXT_ID: u16 = 1288;
10const DEFAULT_RIGHT_CONTEXT_ID: u16 = 1288;
11const DEFAULT_FIELD_VALUE: &str = "*";
12
13#[derive(Clone, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize)]
14
15pub struct ModelInfo {
16    pub feature_count: usize,
17    pub label_count: usize,
18    pub max_left_context_id: usize,
19    pub max_right_context_id: usize,
20    pub connection_matrix_size: String,
21    pub version: String,
22    pub training_iterations: u64,
23    pub regularization: f64,
24    pub updated_at: u64,
25}
26
27#[derive(Clone, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize)]
28
29pub struct Metadata {
30    pub name: String,                   // Name of the dictionary
31    pub encoding: String,               // Character encoding
32    pub compress_algorithm: Algorithm,  // Compression algorithm
33    pub default_word_cost: i16,         // Word cost for simple user dictionary
34    pub default_left_context_id: u16,   // Context ID for simple user dictionary
35    pub default_right_context_id: u16,  // Context ID for simple user dictionary
36    pub default_field_value: String,    // Default value for fields in simple user dictionary
37    pub flexible_csv: bool,             // Handle CSV columns flexibly
38    pub skip_invalid_cost_or_id: bool,  // Skip invalid cost or ID
39    pub normalize_details: bool,        // Normalize characters
40    pub dictionary_schema: Schema,      // Schema for the dictionary
41    pub user_dictionary_schema: Schema, // Schema for user dictionary
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub model_info: Option<ModelInfo>, // Training model information (optional)
44}
45
46impl Default for Metadata {
47    fn default() -> Self {
48        // Default metadata values can be adjusted as needed
49        Metadata::new(
50            "default".to_string(),
51            "UTF-8".to_string(),
52            DEFAULT_COMPRESS_ALGORITHM,
53            DEFAULT_WORD_COST,
54            DEFAULT_LEFT_CONTEXT_ID,
55            DEFAULT_RIGHT_CONTEXT_ID,
56            DEFAULT_FIELD_VALUE.to_string(),
57            false,
58            false,
59            false,
60            Schema::default(),
61            Schema::new(vec![
62                "surface".to_string(),
63                "reading".to_string(),
64                "pronunciation".to_string(),
65            ]),
66        )
67    }
68}
69
70impl Metadata {
71    #[allow(clippy::too_many_arguments)]
72    pub fn new(
73        name: String,
74        encoding: String,
75        compress_algorithm: Algorithm,
76        simple_word_cost: i16,
77        default_left_context_id: u16,
78        default_right_context_id: u16,
79        default_field_value: String,
80        flexible_csv: bool,
81        skip_invalid_cost_or_id: bool,
82        normalize_details: bool,
83        schema: Schema,
84        userdic_schema: Schema,
85    ) -> Self {
86        Self {
87            encoding,
88            compress_algorithm,
89            default_word_cost: simple_word_cost,
90            default_left_context_id,
91            default_right_context_id,
92            default_field_value,
93            dictionary_schema: schema,
94            name,
95            flexible_csv,
96            skip_invalid_cost_or_id,
97            normalize_details,
98            user_dictionary_schema: userdic_schema,
99            model_info: None,
100        }
101    }
102
103    /// Load metadata from binary data (JSON format or compressed binary format).
104    /// This provides a consistent interface with other dictionary components.
105    pub fn load(data: &[u8]) -> crate::LinderaResult<Self> {
106        // If data is empty, return an error since metadata is required
107        if data.is_empty() {
108            return Err(crate::error::LinderaErrorKind::Io
109                .with_error(anyhow::anyhow!("Empty metadata data")));
110        }
111
112        // Try to deserialize as JSON first (for uncompressed metadata.json files)
113        if let Ok(metadata) = serde_json::from_slice(data) {
114            return Ok(metadata);
115        }
116
117        // If JSON fails, try to decompress as rkyv-encoded compressed data
118        #[cfg(feature = "compress")]
119        {
120            use crate::decompress::{CompressedData, decompress};
121
122            if let Ok(compressed_data) =
123                rkyv::from_bytes::<CompressedData, rkyv::rancor::Error>(data)
124            {
125                if let Ok(decompressed) = decompress(compressed_data) {
126                    // Try to parse the decompressed data as JSON
127                    if let Ok(metadata) = serde_json::from_slice(&decompressed) {
128                        return Ok(metadata);
129                    }
130                }
131            }
132        }
133
134        #[cfg(not(feature = "compress"))]
135        {
136            // Without compress feature, data should be raw JSON
137            return serde_json::from_slice(data).map_err(|err| {
138                crate::error::LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(err))
139            });
140        }
141
142        // If all attempts fail, return an error
143        Err(
144            crate::error::LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(
145                "Failed to deserialize metadata from any supported format"
146            )),
147        )
148    }
149
150    /// Load metadata with fallback to default values.
151    /// This is used when feature flags are disabled and data might be empty.
152    pub fn load_or_default(data: &[u8], default_fn: fn() -> Self) -> Self {
153        if data.is_empty() {
154            default_fn()
155        } else {
156            match Self::load(data) {
157                Ok(metadata) => metadata,
158                Err(_) => default_fn(),
159            }
160        }
161    }
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    /// `Metadata::default` wires the module-level defaults into the struct.
    #[test]
    fn test_metadata_default() {
        let metadata = Metadata::default();
        assert_eq!(metadata.name, "default");
        assert_eq!(metadata.encoding, "UTF-8");
        assert_eq!(metadata.default_word_cost, DEFAULT_WORD_COST);
        assert_eq!(metadata.default_left_context_id, DEFAULT_LEFT_CONTEXT_ID);
        assert_eq!(metadata.default_right_context_id, DEFAULT_RIGHT_CONTEXT_ID);
        assert_eq!(metadata.default_field_value, DEFAULT_FIELD_VALUE);
        assert!(!metadata.flexible_csv);
        assert!(!metadata.skip_invalid_cost_or_id);
        assert!(!metadata.normalize_details);
        assert!(metadata.model_info.is_none());
    }

    /// `Metadata::new` stores each argument in the matching field.
    #[test]
    fn test_metadata_new() {
        let metadata = Metadata::new(
            "TestDict".to_string(),
            "UTF-8".to_string(),
            Algorithm::Deflate,
            -10000,
            0,
            0,
            "*".to_string(),
            false,
            false,
            false,
            Schema::default(),
            Schema::new(vec!["surface".to_string(), "reading".to_string()]),
        );
        assert_eq!(metadata.name, "TestDict");
        assert_eq!(metadata.encoding, "UTF-8");
        assert_eq!(metadata.default_word_cost, -10000);
        assert_eq!(metadata.default_left_context_id, 0);
        assert_eq!(metadata.default_right_context_id, 0);
        assert_eq!(metadata.default_field_value, "*");
        assert!(metadata.model_info.is_none());
    }

    /// Metadata survives a serde JSON round trip.
    #[test]
    fn test_metadata_serialization() {
        let metadata = Metadata::default();

        // Test serialization
        let serialized = serde_json::to_string(&metadata).unwrap();
        assert!(serialized.contains("default"));
        assert!(serialized.contains("schema"));
        assert!(serialized.contains("name"));

        // Test deserialization
        let deserialized: Metadata = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized.name, "default");
        assert_eq!(deserialized.encoding, "UTF-8");
        assert_eq!(deserialized.default_word_cost, DEFAULT_WORD_COST);
        assert!(deserialized.model_info.is_none());
    }

    /// Empty input is an error for `load`, because metadata is required.
    #[test]
    fn test_metadata_load_rejects_empty_data() {
        assert!(Metadata::load(&[]).is_err());
    }

    /// `load` accepts plain JSON bytes.
    #[test]
    fn test_metadata_load_from_json() {
        let bytes = serde_json::to_vec(&Metadata::default()).unwrap();
        let Ok(loaded) = Metadata::load(&bytes) else {
            panic!("plain JSON metadata should load");
        };
        assert_eq!(loaded.name, "default");
        assert_eq!(loaded.encoding, "UTF-8");
    }

    /// `load_or_default` falls back to the supplied constructor on empty input.
    #[test]
    fn test_metadata_load_or_default_on_empty_data() {
        let metadata = Metadata::load_or_default(&[], Metadata::default);
        assert_eq!(metadata.name, "default");
    }
}