Skip to main content

lindera_ruby/
metadata.rs

//! Dictionary metadata configuration.
//!
//! This module provides structures for configuring dictionary metadata, including
//! character encodings and schema definitions.
5
6use std::collections::HashMap;
7
8use magnus::prelude::*;
9use magnus::{Error, Ruby, function, method};
10
11use lindera::dictionary::Metadata;
12
13use crate::schema::RbSchema;
14
/// Dictionary metadata configuration.
///
/// Contains all configuration parameters for building and using dictionaries.
///
/// The type is exposed to Ruby as `Lindera::Metadata` through `magnus::wrap`.
/// Its scalar fields carry the same names as the fields read from
/// `lindera::dictionary::Metadata` by the `From` conversions in this module;
/// the two schema fields are stored as `RbSchema` values.
#[magnus::wrap(class = "Lindera::Metadata", free_immediately, size)]
#[derive(Debug, Clone)]
pub struct RbMetadata {
    /// Dictionary name.
    name: String,
    /// Character encoding (for example `"UTF-8"` or `"EUC-JP"`).
    encoding: String,
    /// Default cost for unknown words.
    default_word_cost: i16,
    /// Default left context ID.
    default_left_context_id: u16,
    /// Default right context ID.
    default_right_context_id: u16,
    /// Default value for missing fields.
    default_field_value: String,
    /// Allow flexible CSV parsing.
    flexible_csv: bool,
    /// Skip entries with invalid cost/ID.
    skip_invalid_cost_or_id: bool,
    /// Normalize morphological details.
    normalize_details: bool,
    /// Schema for main dictionary.
    dictionary_schema: RbSchema,
    /// Schema for user dictionary.
    user_dictionary_schema: RbSchema,
}
44
45impl RbMetadata {
46    /// Creates a new `RbMetadata` with optional parameters.
47    ///
48    /// # Arguments
49    ///
50    /// All arguments are optional. Default values are used if not provided.
51    ///
52    /// # Returns
53    ///
54    /// A new `RbMetadata` instance.
55    #[allow(clippy::too_many_arguments)]
56    fn new(
57        name: Option<String>,
58        encoding: Option<String>,
59        default_word_cost: Option<i16>,
60        default_left_context_id: Option<u16>,
61        default_right_context_id: Option<u16>,
62        default_field_value: Option<String>,
63        flexible_csv: Option<bool>,
64        skip_invalid_cost_or_id: Option<bool>,
65        normalize_details: Option<bool>,
66    ) -> Self {
67        RbMetadata {
68            name: name.unwrap_or_else(|| "default".to_string()),
69            encoding: encoding.unwrap_or_else(|| "UTF-8".to_string()),
70            default_word_cost: default_word_cost.unwrap_or(-10000),
71            default_left_context_id: default_left_context_id.unwrap_or(1288),
72            default_right_context_id: default_right_context_id.unwrap_or(1288),
73            default_field_value: default_field_value.unwrap_or_else(|| "*".to_string()),
74            flexible_csv: flexible_csv.unwrap_or(false),
75            skip_invalid_cost_or_id: skip_invalid_cost_or_id.unwrap_or(false),
76            normalize_details: normalize_details.unwrap_or(false),
77            dictionary_schema: RbSchema::create_default_internal(),
78            user_dictionary_schema: RbSchema::new_internal(vec![
79                "surface".to_string(),
80                "reading".to_string(),
81                "pronunciation".to_string(),
82            ]),
83        }
84    }
85
86    /// Creates a default metadata instance.
87    ///
88    /// # Returns
89    ///
90    /// A new `RbMetadata` with default values.
91    fn create_default() -> Self {
92        RbMetadata::new(None, None, None, None, None, None, None, None, None)
93    }
94
95    /// Loads metadata from a JSON file.
96    ///
97    /// # Arguments
98    ///
99    /// * `path` - Path to the JSON file.
100    ///
101    /// # Returns
102    ///
103    /// A new `RbMetadata` loaded from the file.
104    fn from_json_file(path: String) -> Result<Self, Error> {
105        let ruby = Ruby::get().expect("Ruby runtime not initialized");
106
107        let json_str = std::fs::read_to_string(&path).map_err(|e| {
108            Error::new(
109                ruby.exception_io_error(),
110                format!("Failed to read file: {e}"),
111            )
112        })?;
113
114        let metadata: Metadata = serde_json::from_str(&json_str).map_err(|e| {
115            Error::new(
116                ruby.exception_arg_error(),
117                format!("Failed to parse JSON: {e}"),
118            )
119        })?;
120
121        Ok(metadata.into())
122    }
123
124    /// Returns the dictionary name.
125    fn name(&self) -> String {
126        self.name.clone()
127    }
128
129    /// Returns the character encoding.
130    fn encoding(&self) -> String {
131        self.encoding.clone()
132    }
133
134    /// Returns the default word cost.
135    fn default_word_cost(&self) -> i16 {
136        self.default_word_cost
137    }
138
139    /// Returns the default left context ID.
140    fn default_left_context_id(&self) -> u16 {
141        self.default_left_context_id
142    }
143
144    /// Returns the default right context ID.
145    fn default_right_context_id(&self) -> u16 {
146        self.default_right_context_id
147    }
148
149    /// Returns the default field value.
150    fn default_field_value(&self) -> String {
151        self.default_field_value.clone()
152    }
153
154    /// Returns whether flexible CSV parsing is enabled.
155    fn flexible_csv(&self) -> bool {
156        self.flexible_csv
157    }
158
159    /// Returns whether invalid cost/ID entries should be skipped.
160    fn skip_invalid_cost_or_id(&self) -> bool {
161        self.skip_invalid_cost_or_id
162    }
163
164    /// Returns whether morphological details should be normalized.
165    fn normalize_details(&self) -> bool {
166        self.normalize_details
167    }
168
169    /// Converts the metadata to a Ruby hash.
170    ///
171    /// # Returns
172    ///
173    /// A HashMap of metadata properties.
174    fn to_hash(&self) -> HashMap<String, String> {
175        let mut dict = HashMap::new();
176        dict.insert("name".to_string(), self.name.clone());
177        dict.insert("encoding".to_string(), self.encoding.clone());
178        dict.insert(
179            "default_word_cost".to_string(),
180            self.default_word_cost.to_string(),
181        );
182        dict.insert(
183            "default_left_context_id".to_string(),
184            self.default_left_context_id.to_string(),
185        );
186        dict.insert(
187            "default_right_context_id".to_string(),
188            self.default_right_context_id.to_string(),
189        );
190        dict.insert(
191            "default_field_value".to_string(),
192            self.default_field_value.clone(),
193        );
194        dict.insert("flexible_csv".to_string(), self.flexible_csv.to_string());
195        dict.insert(
196            "skip_invalid_cost_or_id".to_string(),
197            self.skip_invalid_cost_or_id.to_string(),
198        );
199        dict.insert(
200            "normalize_details".to_string(),
201            self.normalize_details.to_string(),
202        );
203        dict.insert(
204            "dictionary_schema_fields".to_string(),
205            self.dictionary_schema.fields.join(","),
206        );
207        dict.insert(
208            "user_dictionary_schema_fields".to_string(),
209            self.user_dictionary_schema.fields.join(","),
210        );
211        dict
212    }
213
214    /// Returns the string representation of the metadata.
215    fn to_s(&self) -> String {
216        format!(
217            "Metadata(name='{}', encoding='{}')",
218            self.name, self.encoding,
219        )
220    }
221
222    /// Returns the inspect representation of the metadata.
223    fn inspect(&self) -> String {
224        format!(
225            "#<Lindera::Metadata: name='{}', encoding='{}', schema_fields={}>",
226            self.name,
227            self.encoding,
228            self.dictionary_schema.fields.len()
229        )
230    }
231}
232
233impl From<RbMetadata> for Metadata {
234    fn from(metadata: RbMetadata) -> Self {
235        Metadata::new(
236            metadata.name,
237            metadata.encoding,
238            metadata.default_word_cost,
239            metadata.default_left_context_id,
240            metadata.default_right_context_id,
241            metadata.default_field_value,
242            metadata.flexible_csv,
243            metadata.skip_invalid_cost_or_id,
244            metadata.normalize_details,
245            metadata.dictionary_schema.into(),
246            metadata.user_dictionary_schema.into(),
247        )
248    }
249}
250
251impl From<Metadata> for RbMetadata {
252    fn from(metadata: Metadata) -> Self {
253        RbMetadata {
254            name: metadata.name,
255            encoding: metadata.encoding,
256            default_word_cost: metadata.default_word_cost,
257            default_left_context_id: metadata.default_left_context_id,
258            default_right_context_id: metadata.default_right_context_id,
259            default_field_value: metadata.default_field_value,
260            flexible_csv: metadata.flexible_csv,
261            skip_invalid_cost_or_id: metadata.skip_invalid_cost_or_id,
262            normalize_details: metadata.normalize_details,
263            dictionary_schema: metadata.dictionary_schema.into(),
264            user_dictionary_schema: metadata.user_dictionary_schema.into(),
265        }
266    }
267}
268
269/// Defines Metadata class in the given Ruby module.
270///
271/// # Arguments
272///
273/// * `ruby` - Ruby runtime handle.
274/// * `module` - Parent Ruby module.
275///
276/// # Returns
277///
278/// `Ok(())` on success, or a Magnus `Error` on failure.
279pub fn define(ruby: &Ruby, module: &magnus::RModule) -> Result<(), Error> {
280    let metadata_class = module.define_class("Metadata", ruby.class_object())?;
281    metadata_class.define_singleton_method("new", function!(RbMetadata::new, 9))?;
282    metadata_class
283        .define_singleton_method("create_default", function!(RbMetadata::create_default, 0))?;
284    metadata_class
285        .define_singleton_method("from_json_file", function!(RbMetadata::from_json_file, 1))?;
286    metadata_class.define_method("name", method!(RbMetadata::name, 0))?;
287    metadata_class.define_method("encoding", method!(RbMetadata::encoding, 0))?;
288    metadata_class.define_method(
289        "default_word_cost",
290        method!(RbMetadata::default_word_cost, 0),
291    )?;
292    metadata_class.define_method(
293        "default_left_context_id",
294        method!(RbMetadata::default_left_context_id, 0),
295    )?;
296    metadata_class.define_method(
297        "default_right_context_id",
298        method!(RbMetadata::default_right_context_id, 0),
299    )?;
300    metadata_class.define_method(
301        "default_field_value",
302        method!(RbMetadata::default_field_value, 0),
303    )?;
304    metadata_class.define_method("flexible_csv", method!(RbMetadata::flexible_csv, 0))?;
305    metadata_class.define_method(
306        "skip_invalid_cost_or_id",
307        method!(RbMetadata::skip_invalid_cost_or_id, 0),
308    )?;
309    metadata_class.define_method(
310        "normalize_details",
311        method!(RbMetadata::normalize_details, 0),
312    )?;
313    metadata_class.define_method("to_hash", method!(RbMetadata::to_hash, 0))?;
314    metadata_class.define_method("to_h", method!(RbMetadata::to_hash, 0))?;
315    metadata_class.define_method("to_s", method!(RbMetadata::to_s, 0))?;
316    metadata_class.define_method("inspect", method!(RbMetadata::inspect, 0))?;
317
318    Ok(())
319}
320
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns a list of string literals into owned schema field names.
    fn field_names(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| String::from(*name)).collect()
    }

    /// Every field of an `RbMetadata` must survive conversion into lindera's `Metadata`.
    #[test]
    fn test_rb_metadata_to_lindera_metadata() {
        let source = RbMetadata {
            name: String::from("test_dict"),
            encoding: String::from("EUC-JP"),
            default_word_cost: -5000,
            default_left_context_id: 100,
            default_right_context_id: 200,
            default_field_value: String::from("N/A"),
            flexible_csv: true,
            skip_invalid_cost_or_id: true,
            normalize_details: true,
            dictionary_schema: RbSchema::new_internal(field_names(&["surface", "cost"])),
            user_dictionary_schema: RbSchema::new_internal(field_names(&["surface"])),
        };

        let converted = Metadata::from(source);

        assert_eq!(converted.name, "test_dict");
        assert_eq!(converted.encoding, "EUC-JP");
        assert_eq!(converted.default_word_cost, -5000);
        assert_eq!(converted.default_left_context_id, 100);
        assert_eq!(converted.default_right_context_id, 200);
        assert_eq!(converted.default_field_value, "N/A");
        assert!(converted.flexible_csv);
        assert!(converted.skip_invalid_cost_or_id);
        assert!(converted.normalize_details);

        let dictionary_field_count = converted.dictionary_schema.get_all_fields().len();
        let user_field_count = converted.user_dictionary_schema.get_all_fields().len();
        assert_eq!(dictionary_field_count, 2);
        assert_eq!(user_field_count, 1);
    }

    /// Every field of lindera's `Metadata` must survive conversion into `RbMetadata`.
    #[test]
    fn test_lindera_metadata_to_rb_metadata() {
        let source = Metadata::new(
            String::from("my_dict"),
            String::from("UTF-8"),
            -8000,
            500,
            600,
            String::from("?"),
            false,
            true,
            false,
            lindera::dictionary::Schema::new(field_names(&["surface", "cost"])),
            lindera::dictionary::Schema::new(field_names(&["surface", "reading"])),
        );

        let converted = RbMetadata::from(source);

        assert_eq!(converted.name, "my_dict");
        assert_eq!(converted.encoding, "UTF-8");
        assert_eq!(converted.default_word_cost, -8000);
        assert_eq!(converted.default_left_context_id, 500);
        assert_eq!(converted.default_right_context_id, 600);
        assert_eq!(converted.default_field_value, "?");
        assert!(!converted.flexible_csv);
        assert!(converted.skip_invalid_cost_or_id);
        assert!(!converted.normalize_details);
        assert_eq!(converted.dictionary_schema.fields.len(), 2);
        assert_eq!(converted.user_dictionary_schema.fields.len(), 2);
    }

    /// Converting to lindera's `Metadata` and back must leave the values unchanged.
    #[test]
    fn test_rb_metadata_roundtrip() {
        let original = RbMetadata {
            name: String::from("roundtrip"),
            encoding: String::from("UTF-8"),
            default_word_cost: -10000,
            default_left_context_id: 1288,
            default_right_context_id: 1288,
            default_field_value: String::from("*"),
            flexible_csv: false,
            skip_invalid_cost_or_id: false,
            normalize_details: false,
            dictionary_schema: RbSchema::create_default_internal(),
            user_dictionary_schema: RbSchema::new_internal(field_names(&[
                "surface",
                "reading",
                "pronunciation",
            ])),
        };

        let restored = RbMetadata::from(Metadata::from(original));

        assert_eq!(restored.name, "roundtrip");
        assert_eq!(restored.encoding, "UTF-8");
        assert_eq!(restored.default_word_cost, -10000);
        assert_eq!(restored.default_left_context_id, 1288);
        assert_eq!(restored.default_right_context_id, 1288);
        assert_eq!(restored.default_field_value, "*");
        assert!(!restored.flexible_csv);
        assert!(!restored.skip_invalid_cost_or_id);
        assert!(!restored.normalize_details);
        assert_eq!(restored.dictionary_schema.fields.len(), 13);
        assert_eq!(restored.user_dictionary_schema.fields.len(), 3);
    }
}