Skip to main content

lindera_dictionary/dictionary/
metadata.rs

1use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
2use serde::{Deserialize, Serialize};
3
4use crate::dictionary::schema::Schema;
5
/// Word cost that `Metadata::default()` stores in `default_word_cost`.
const DEFAULT_WORD_COST: i16 = -10000;

/// Left context ID that `Metadata::default()` stores in `default_left_context_id`.
// NOTE(review): the origin of 1288 is not visible in this file — confirm against the dictionary.
const DEFAULT_LEFT_CONTEXT_ID: u16 = 1288;

/// Right context ID that `Metadata::default()` stores in `default_right_context_id`.
const DEFAULT_RIGHT_CONTEXT_ID: u16 = 1288;

/// Placeholder that `Metadata::default()` stores in `default_field_value`.
const DEFAULT_FIELD_VALUE: &str = "*";
10
11#[derive(Clone, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize)]
12
13pub struct ModelInfo {
14    pub feature_count: usize,
15    pub label_count: usize,
16    pub max_left_context_id: usize,
17    pub max_right_context_id: usize,
18    pub connection_matrix_size: String,
19    pub version: String,
20    pub training_iterations: u64,
21    pub regularization: f64,
22    pub updated_at: u64,
23}
24
25#[derive(Clone, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize)]
26
27pub struct Metadata {
28    pub name: String,                   // Name of the dictionary
29    pub encoding: String,               // Character encoding
30    pub default_word_cost: i16,         // Word cost for simple user dictionary
31    pub default_left_context_id: u16,   // Context ID for simple user dictionary
32    pub default_right_context_id: u16,  // Context ID for simple user dictionary
33    pub default_field_value: String,    // Default value for fields in simple user dictionary
34    pub flexible_csv: bool,             // Handle CSV columns flexibly
35    pub skip_invalid_cost_or_id: bool,  // Skip invalid cost or ID
36    pub normalize_details: bool,        // Normalize characters
37    pub dictionary_schema: Schema,      // Schema for the dictionary
38    pub user_dictionary_schema: Schema, // Schema for user dictionary
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub model_info: Option<ModelInfo>, // Training model information (optional)
41}
42
43impl Default for Metadata {
44    fn default() -> Self {
45        // Default metadata values can be adjusted as needed
46        Metadata::new(
47            "default".to_string(),
48            "UTF-8".to_string(),
49            DEFAULT_WORD_COST,
50            DEFAULT_LEFT_CONTEXT_ID,
51            DEFAULT_RIGHT_CONTEXT_ID,
52            DEFAULT_FIELD_VALUE.to_string(),
53            false,
54            false,
55            false,
56            Schema::default(),
57            Schema::new(vec![
58                "surface".to_string(),
59                "reading".to_string(),
60                "pronunciation".to_string(),
61            ]),
62        )
63    }
64}
65
66impl Metadata {
67    #[allow(clippy::too_many_arguments)]
68    pub fn new(
69        name: String,
70        encoding: String,
71        simple_word_cost: i16,
72        default_left_context_id: u16,
73        default_right_context_id: u16,
74        default_field_value: String,
75        flexible_csv: bool,
76        skip_invalid_cost_or_id: bool,
77        normalize_details: bool,
78        schema: Schema,
79        userdic_schema: Schema,
80    ) -> Self {
81        Self {
82            encoding,
83            default_word_cost: simple_word_cost,
84            default_left_context_id,
85            default_right_context_id,
86            default_field_value,
87            dictionary_schema: schema,
88            name,
89            flexible_csv,
90            skip_invalid_cost_or_id,
91            normalize_details,
92            user_dictionary_schema: userdic_schema,
93            model_info: None,
94        }
95    }
96
97    /// Load metadata from binary data (JSON format).
98    /// This provides a consistent interface with other dictionary components.
99    pub fn load(data: &[u8]) -> crate::LinderaResult<Self> {
100        // If data is empty, return an error since metadata is required
101        if data.is_empty() {
102            return Err(crate::error::LinderaErrorKind::Io
103                .with_error(anyhow::anyhow!("Empty metadata data")));
104        }
105
106        // Deserialize as JSON
107        serde_json::from_slice(data).map_err(|err| {
108            crate::error::LinderaErrorKind::Deserialize
109                .with_error(anyhow::anyhow!(err))
110                .add_context("Failed to deserialize metadata from JSON")
111        })
112    }
113
114    /// Load metadata with fallback to default values.
115    /// This is used when feature flags are disabled and data might be empty.
116    pub fn load_or_default(data: &[u8], default_fn: fn() -> Self) -> Self {
117        if data.is_empty() {
118            default_fn()
119        } else {
120            match Self::load(data) {
121                Ok(metadata) => metadata,
122                Err(_) => default_fn(),
123            }
124        }
125    }
126}
127
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds metadata whose scalar arguments are all distinct, so that a
    /// swapped field assignment in `Metadata::new` would be detected.
    fn sample_metadata() -> Metadata {
        Metadata::new(
            "TestDict".to_string(),
            "UTF-8".to_string(),
            -10000,
            1,
            2,
            "*".to_string(),
            true,
            false,
            true,
            Schema::default(),
            Schema::new(vec!["surface".to_string(), "reading".to_string()]),
        )
    }

    /// The default metadata must carry the module-level default values.
    #[test]
    fn test_metadata_default() {
        let metadata = Metadata::default();
        assert_eq!(metadata.name, "default");
        assert_eq!(metadata.encoding, "UTF-8");
        assert_eq!(metadata.default_word_cost, DEFAULT_WORD_COST);
        assert_eq!(metadata.default_left_context_id, DEFAULT_LEFT_CONTEXT_ID);
        assert_eq!(metadata.default_right_context_id, DEFAULT_RIGHT_CONTEXT_ID);
        assert_eq!(metadata.default_field_value, DEFAULT_FIELD_VALUE);
        assert!(!metadata.flexible_csv);
        assert!(!metadata.skip_invalid_cost_or_id);
        assert!(!metadata.normalize_details);
        assert!(metadata.model_info.is_none());
    }

    /// Every constructor argument must land in the matching field.
    #[test]
    fn test_metadata_new() {
        let metadata = sample_metadata();
        assert_eq!(metadata.name, "TestDict");
        assert_eq!(metadata.encoding, "UTF-8");
        assert_eq!(metadata.default_word_cost, -10000);
        assert_eq!(metadata.default_left_context_id, 1);
        assert_eq!(metadata.default_right_context_id, 2);
        assert_eq!(metadata.default_field_value, "*");
        assert!(metadata.flexible_csv);
        assert!(!metadata.skip_invalid_cost_or_id);
        assert!(metadata.normalize_details);
        assert!(metadata.model_info.is_none());
    }

    /// JSON output must round-trip and omit `model_info` while it is `None`.
    #[test]
    fn test_metadata_serialization() {
        let metadata = Metadata::default();

        // Test serialization
        let serialized = serde_json::to_string(&metadata).unwrap();
        assert!(serialized.contains("default"));
        assert!(serialized.contains("schema"));
        assert!(serialized.contains("name"));
        // `skip_serializing_if = "Option::is_none"` drops the absent model info.
        assert!(!serialized.contains("model_info"));

        // Test deserialization
        let deserialized: Metadata = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized.name, "default");
        assert!(deserialized.model_info.is_none());
    }

    /// `load` must reject empty and malformed input and accept valid JSON.
    #[test]
    fn test_metadata_load() {
        assert!(Metadata::load(b"").is_err());
        assert!(Metadata::load(b"not json").is_err());

        let serialized = serde_json::to_vec(&sample_metadata()).unwrap();
        let loaded_name = Metadata::load(&serialized).ok().map(|m| m.name);
        assert_eq!(loaded_name, Some("TestDict".to_string()));
    }

    /// `load_or_default` must fall back only when loading fails.
    #[test]
    fn test_metadata_load_or_default() {
        // Empty input falls back to the supplied default.
        let from_empty = Metadata::load_or_default(b"", Metadata::default);
        assert_eq!(from_empty.name, "default");

        // Malformed input also falls back.
        let from_invalid = Metadata::load_or_default(b"not json", Metadata::default);
        assert_eq!(from_invalid.name, "default");

        // Valid input is used as-is.
        let serialized = serde_json::to_vec(&sample_metadata()).unwrap();
        let from_valid = Metadata::load_or_default(&serialized, Metadata::default);
        assert_eq!(from_valid.name, "TestDict");
    }
}