lindera/
dictionary.rs

1//! Dictionary management for morphological analysis.
2//!
3//! This module provides functionality for building, loading, and managing dictionaries
4//! used in morphological analysis.
5//!
6//! # Dictionary Types
7//!
8//! - **Dictionary**: Main dictionary for morphological analysis
9//! - **UserDictionary**: Custom user-defined dictionary for additional words
10//!
11//! # Examples
12//!
13//! ```python
14//! import lindera
15//!
16//! # Load a pre-built dictionary
17//! dictionary = lindera.load_dictionary("ipadic")
18//!
19//! # Build a custom dictionary
20//! metadata = lindera.Metadata()
21//! lindera.build_dictionary("/path/to/input", "/path/to/output", metadata)
22//!
23//! # Build a user dictionary
24//! lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output")
25//! ```
26
27use std::path::Path;
28
29use pyo3::{exceptions::PyValueError, prelude::*};
30
31use lindera::dictionary::{
32    Dictionary, DictionaryBuilder, Metadata, UserDictionary,
33    load_dictionary as lindera_load_dictionary,
34    load_user_dictionary as lindera_load_user_dictionary,
35};
36
37use crate::metadata::PyMetadata;
38
39/// A morphological analysis dictionary.
40///
41/// Contains the data structures needed for tokenization and morphological analysis.
42///
43/// # Examples
44///
45/// ```python
46/// # Load a dictionary
47/// dictionary = lindera.load_dictionary("ipadic")
48///
49/// # Access metadata
50/// print(dictionary.metadata_name())
51/// print(dictionary.metadata_encoding())
52/// ```
53#[pyclass(name = "Dictionary")]
54#[derive(Clone)]
55pub struct PyDictionary {
56    pub inner: Dictionary,
57}
58
59#[pymethods]
60impl PyDictionary {
61    /// Returns the name of the dictionary metadata.
62    pub fn metadata_name(&self) -> String {
63        self.inner.metadata.name.clone()
64    }
65
66    /// Returns the character encoding of the dictionary.
67    pub fn metadata_encoding(&self) -> String {
68        self.inner.metadata.encoding.clone()
69    }
70
71    /// Returns the full metadata object of the dictionary.
72    pub fn metadata(&self) -> PyMetadata {
73        PyMetadata::from(self.inner.metadata.clone())
74    }
75
76    fn __str__(&self) -> String {
77        "Dictionary".to_string()
78    }
79
80    fn __repr__(&self) -> String {
81        "Dictionary()".to_string()
82    }
83}
84
85impl PyDictionary {
86    // Internal helper function to create PyDictionary from Lindera Dictionary
87    pub fn new(dictionary: Dictionary) -> Self {
88        Self { inner: dictionary }
89    }
90}
91
92/// A user-defined dictionary for custom words.
93///
94/// User dictionaries allow you to add custom words and their morphological features
95/// that are not present in the main dictionary.
96///
97/// # Examples
98///
99/// ```python
100/// # Build a user dictionary
101/// lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output")
102///
103/// # Load it
104/// metadata = lindera.Metadata()
105/// user_dict = lindera.load_user_dictionary("/path/to/output", metadata)
106/// ```
107#[pyclass(name = "UserDictionary")]
108#[derive(Clone)]
109pub struct PyUserDictionary {
110    pub inner: UserDictionary,
111}
112
113#[pymethods]
114impl PyUserDictionary {
115    fn __str__(&self) -> String {
116        "UserDictionary".to_string()
117    }
118
119    fn __repr__(&self) -> String {
120        "UserDictionary()".to_string()
121    }
122}
123
124impl PyUserDictionary {
125    // Internal helper function to create PyUserDictionary from Lindera UserDictionary
126    pub fn new(user_dictionary: UserDictionary) -> Self {
127        Self {
128            inner: user_dictionary,
129        }
130    }
131}
132
133/// Builds a dictionary from source files.
134///
135/// # Arguments
136///
137/// * `input_dir` - Directory containing dictionary source files.
138/// * `output_dir` - Directory where the built dictionary will be saved.
139/// * `metadata` - Metadata configuration for the dictionary.
140///
141/// # Errors
142///
143/// Returns an error if the input directory doesn't exist or if the build fails.
144///
145/// # Examples
146///
147/// ```python
148/// metadata = lindera.Metadata(name="custom", encoding="UTF-8")
149/// lindera.build_dictionary("/path/to/input", "/path/to/output", metadata)
150/// ```
151#[pyfunction]
152#[pyo3(signature = (input_dir, output_dir, metadata))]
153pub fn build_dictionary(input_dir: &str, output_dir: &str, metadata: PyMetadata) -> PyResult<()> {
154    let input_path = Path::new(input_dir);
155    let output_path = Path::new(output_dir);
156
157    if !input_path.exists() {
158        return Err(PyValueError::new_err(format!(
159            "Input directory does not exist: {input_dir}"
160        )));
161    }
162
163    let builder = DictionaryBuilder::new(metadata.into());
164
165    builder
166        .build_dictionary(input_path, output_path)
167        .map_err(|e| PyValueError::new_err(format!("Failed to build dictionary: {e}")))?;
168
169    Ok(())
170}
171
172/// Builds a user dictionary from a CSV file.
173///
174/// # Arguments
175///
176/// * `_kind` - Dictionary kind (currently unused, reserved for future use).
177/// * `input_file` - Path to the CSV file containing user dictionary entries.
178/// * `output_dir` - Directory where the built user dictionary will be saved.
179/// * `metadata` - Optional metadata configuration. If None, default values are used.
180///
181/// # CSV Format
182///
183/// The CSV file should contain entries in the format specified by the dictionary schema.
184/// Typically: surface,reading,pronunciation
185///
186/// # Errors
187///
188/// Returns an error if the input file doesn't exist or if the build fails.
189///
190/// # Examples
191///
192/// ```python
193/// # Build with default metadata
194/// lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output")
195///
196/// # Build with custom metadata
197/// metadata = lindera.Metadata()
198/// lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output", metadata)
199/// ```
200#[pyfunction]
201#[pyo3(signature = (_kind, input_file, output_dir, metadata=None))]
202pub fn build_user_dictionary(
203    _kind: &str,
204    input_file: &str,
205    output_dir: &str,
206    metadata: Option<crate::metadata::PyMetadata>,
207) -> PyResult<()> {
208    let input_path = Path::new(input_file);
209    let output_path = Path::new(output_dir);
210
211    if !input_path.exists() {
212        return Err(PyValueError::new_err(format!(
213            "Input file does not exist: {input_file}"
214        )));
215    }
216
217    // Use provided metadata or create default
218    let meta = match metadata {
219        Some(py_metadata) => {
220            let lindera_meta: Metadata = py_metadata.into();
221            lindera_meta
222        }
223        None => Metadata::default(),
224    };
225
226    let builder = DictionaryBuilder::new(meta);
227
228    // Build user dictionary from CSV
229    builder
230        .build_user_dictionary(input_path, output_path)
231        .map_err(|e| PyValueError::new_err(format!("Failed to build user dictionary: {e}")))?;
232
233    Ok(())
234}
235
236/// Loads a dictionary from the specified URI.
237///
238/// # Arguments
239///
240/// * `uri` - URI to the dictionary. Can be a file path or embedded dictionary name.
241///
242/// # Supported URIs
243///
244/// - File paths: `/path/to/dictionary`
245/// - Embedded dictionaries: `ipadic`, `unidic`, `ko-dic`, `cc-cedict`
246///
247/// # Returns
248///
249/// A loaded `Dictionary` object.
250///
251/// # Errors
252///
253/// Returns an error if the dictionary cannot be loaded from the specified URI.
254///
255/// # Examples
256///
257/// ```python
258/// # Load an embedded dictionary
259/// dict = lindera.load_dictionary("ipadic")
260///
261/// # Load from file path
262/// dict = lindera.load_dictionary("/path/to/dictionary")
263/// ```
264#[pyfunction]
265#[pyo3(signature = (uri))]
266pub fn load_dictionary(uri: &str) -> PyResult<PyDictionary> {
267    lindera_load_dictionary(uri)
268        .map_err(|e| PyValueError::new_err(format!("Failed to load dictionary from '{uri}': {e}")))
269        .map(PyDictionary::new)
270}
271
272/// Loads a user dictionary from the specified URI.
273///
274/// # Arguments
275///
276/// * `uri` - URI to the user dictionary directory.
277/// * `metadata` - Metadata configuration for the user dictionary.
278///
279/// # Returns
280///
281/// A loaded `UserDictionary` object.
282///
283/// # Errors
284///
285/// Returns an error if the user dictionary cannot be loaded.
286///
287/// # Examples
288///
289/// ```python
290/// metadata = lindera.Metadata()
291/// user_dict = lindera.load_user_dictionary("/path/to/user_dict", metadata)
292/// ```
293#[pyfunction]
294#[pyo3(signature = (uri, metadata))]
295pub fn load_user_dictionary(uri: &str, metadata: PyMetadata) -> PyResult<PyUserDictionary> {
296    let meta: Metadata = metadata.into();
297    lindera_load_user_dictionary(uri, &meta)
298        .map_err(|e| {
299            PyValueError::new_err(format!("Failed to load user dictionary from '{uri}': {e}"))
300        })
301        .map(PyUserDictionary::new)
302}