lindera/dictionary.rs
1//! Dictionary management for morphological analysis.
2//!
3//! This module provides functionality for building, loading, and managing dictionaries
4//! used in morphological analysis.
5//!
6//! # Dictionary Types
7//!
8//! - **Dictionary**: Main dictionary for morphological analysis
9//! - **UserDictionary**: Custom user-defined dictionary for additional words
10//!
11//! # Examples
12//!
13//! ```python
14//! import lindera
15//!
16//! # Load a pre-built dictionary
17//! dictionary = lindera.load_dictionary("ipadic")
18//!
19//! # Build a custom dictionary
20//! metadata = lindera.Metadata()
21//! lindera.build_dictionary("/path/to/input", "/path/to/output", metadata)
22//!
23//! # Build a user dictionary
24//! lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output")
25//! ```
26
27use std::path::Path;
28
29use pyo3::{exceptions::PyValueError, prelude::*};
30
31use lindera::dictionary::{
32 Dictionary, DictionaryBuilder, Metadata, UserDictionary,
33 load_dictionary as lindera_load_dictionary,
34 load_user_dictionary as lindera_load_user_dictionary,
35};
36
37use crate::metadata::PyMetadata;
38
39/// A morphological analysis dictionary.
40///
41/// Contains the data structures needed for tokenization and morphological analysis.
42///
43/// # Examples
44///
45/// ```python
46/// # Load a dictionary
47/// dictionary = lindera.load_dictionary("ipadic")
48///
49/// # Access metadata
50/// print(dictionary.metadata_name())
51/// print(dictionary.metadata_encoding())
52/// ```
53#[pyclass(name = "Dictionary")]
54#[derive(Clone)]
55pub struct PyDictionary {
56 pub inner: Dictionary,
57}
58
59#[pymethods]
60impl PyDictionary {
61 /// Returns the name of the dictionary metadata.
62 pub fn metadata_name(&self) -> String {
63 self.inner.metadata.name.clone()
64 }
65
66 /// Returns the character encoding of the dictionary.
67 pub fn metadata_encoding(&self) -> String {
68 self.inner.metadata.encoding.clone()
69 }
70
71 /// Returns the full metadata object of the dictionary.
72 pub fn metadata(&self) -> PyMetadata {
73 PyMetadata::from(self.inner.metadata.clone())
74 }
75
76 fn __str__(&self) -> String {
77 "Dictionary".to_string()
78 }
79
80 fn __repr__(&self) -> String {
81 "Dictionary()".to_string()
82 }
83}
84
85impl PyDictionary {
86 // Internal helper function to create PyDictionary from Lindera Dictionary
87 pub fn new(dictionary: Dictionary) -> Self {
88 Self { inner: dictionary }
89 }
90}
91
92/// A user-defined dictionary for custom words.
93///
94/// User dictionaries allow you to add custom words and their morphological features
95/// that are not present in the main dictionary.
96///
97/// # Examples
98///
99/// ```python
100/// # Build a user dictionary
101/// lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output")
102///
103/// # Load it
104/// metadata = lindera.Metadata()
105/// user_dict = lindera.load_user_dictionary("/path/to/output", metadata)
106/// ```
107#[pyclass(name = "UserDictionary")]
108#[derive(Clone)]
109pub struct PyUserDictionary {
110 pub inner: UserDictionary,
111}
112
113#[pymethods]
114impl PyUserDictionary {
115 fn __str__(&self) -> String {
116 "UserDictionary".to_string()
117 }
118
119 fn __repr__(&self) -> String {
120 "UserDictionary()".to_string()
121 }
122}
123
124impl PyUserDictionary {
125 // Internal helper function to create PyUserDictionary from Lindera UserDictionary
126 pub fn new(user_dictionary: UserDictionary) -> Self {
127 Self {
128 inner: user_dictionary,
129 }
130 }
131}
132
133/// Builds a dictionary from source files.
134///
135/// # Arguments
136///
137/// * `input_dir` - Directory containing dictionary source files.
138/// * `output_dir` - Directory where the built dictionary will be saved.
139/// * `metadata` - Metadata configuration for the dictionary.
140///
141/// # Errors
142///
143/// Returns an error if the input directory doesn't exist or if the build fails.
144///
145/// # Examples
146///
147/// ```python
148/// metadata = lindera.Metadata(name="custom", encoding="UTF-8")
149/// lindera.build_dictionary("/path/to/input", "/path/to/output", metadata)
150/// ```
151#[pyfunction]
152#[pyo3(signature = (input_dir, output_dir, metadata))]
153pub fn build_dictionary(input_dir: &str, output_dir: &str, metadata: PyMetadata) -> PyResult<()> {
154 let input_path = Path::new(input_dir);
155 let output_path = Path::new(output_dir);
156
157 if !input_path.exists() {
158 return Err(PyValueError::new_err(format!(
159 "Input directory does not exist: {input_dir}"
160 )));
161 }
162
163 let builder = DictionaryBuilder::new(metadata.into());
164
165 builder
166 .build_dictionary(input_path, output_path)
167 .map_err(|e| PyValueError::new_err(format!("Failed to build dictionary: {e}")))?;
168
169 Ok(())
170}
171
172/// Builds a user dictionary from a CSV file.
173///
174/// # Arguments
175///
176/// * `_kind` - Dictionary kind (currently unused, reserved for future use).
177/// * `input_file` - Path to the CSV file containing user dictionary entries.
178/// * `output_dir` - Directory where the built user dictionary will be saved.
179/// * `metadata` - Optional metadata configuration. If None, default values are used.
180///
181/// # CSV Format
182///
183/// The CSV file should contain entries in the format specified by the dictionary schema.
184/// Typically: surface,reading,pronunciation
185///
186/// # Errors
187///
188/// Returns an error if the input file doesn't exist or if the build fails.
189///
190/// # Examples
191///
192/// ```python
193/// # Build with default metadata
194/// lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output")
195///
196/// # Build with custom metadata
197/// metadata = lindera.Metadata()
198/// lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output", metadata)
199/// ```
200#[pyfunction]
201#[pyo3(signature = (_kind, input_file, output_dir, metadata=None))]
202pub fn build_user_dictionary(
203 _kind: &str,
204 input_file: &str,
205 output_dir: &str,
206 metadata: Option<crate::metadata::PyMetadata>,
207) -> PyResult<()> {
208 let input_path = Path::new(input_file);
209 let output_path = Path::new(output_dir);
210
211 if !input_path.exists() {
212 return Err(PyValueError::new_err(format!(
213 "Input file does not exist: {input_file}"
214 )));
215 }
216
217 // Use provided metadata or create default
218 let meta = match metadata {
219 Some(py_metadata) => {
220 let lindera_meta: Metadata = py_metadata.into();
221 lindera_meta
222 }
223 None => Metadata::default(),
224 };
225
226 let builder = DictionaryBuilder::new(meta);
227
228 // Build user dictionary from CSV
229 builder
230 .build_user_dictionary(input_path, output_path)
231 .map_err(|e| PyValueError::new_err(format!("Failed to build user dictionary: {e}")))?;
232
233 Ok(())
234}
235
236/// Loads a dictionary from the specified URI.
237///
238/// # Arguments
239///
240/// * `uri` - URI to the dictionary. Can be a file path or embedded dictionary name.
241///
242/// # Supported URIs
243///
244/// - File paths: `/path/to/dictionary`
245/// - Embedded dictionaries: `ipadic`, `unidic`, `ko-dic`, `cc-cedict`
246///
247/// # Returns
248///
249/// A loaded `Dictionary` object.
250///
251/// # Errors
252///
253/// Returns an error if the dictionary cannot be loaded from the specified URI.
254///
255/// # Examples
256///
257/// ```python
258/// # Load an embedded dictionary
259/// dict = lindera.load_dictionary("ipadic")
260///
261/// # Load from file path
262/// dict = lindera.load_dictionary("/path/to/dictionary")
263/// ```
264#[pyfunction]
265#[pyo3(signature = (uri))]
266pub fn load_dictionary(uri: &str) -> PyResult<PyDictionary> {
267 lindera_load_dictionary(uri)
268 .map_err(|e| PyValueError::new_err(format!("Failed to load dictionary from '{uri}': {e}")))
269 .map(PyDictionary::new)
270}
271
272/// Loads a user dictionary from the specified URI.
273///
274/// # Arguments
275///
276/// * `uri` - URI to the user dictionary directory.
277/// * `metadata` - Metadata configuration for the user dictionary.
278///
279/// # Returns
280///
281/// A loaded `UserDictionary` object.
282///
283/// # Errors
284///
285/// Returns an error if the user dictionary cannot be loaded.
286///
287/// # Examples
288///
289/// ```python
290/// metadata = lindera.Metadata()
291/// user_dict = lindera.load_user_dictionary("/path/to/user_dict", metadata)
292/// ```
293#[pyfunction]
294#[pyo3(signature = (uri, metadata))]
295pub fn load_user_dictionary(uri: &str, metadata: PyMetadata) -> PyResult<PyUserDictionary> {
296 let meta: Metadata = metadata.into();
297 lindera_load_user_dictionary(uri, &meta)
298 .map_err(|e| {
299 PyValueError::new_err(format!("Failed to load user dictionary from '{uri}': {e}"))
300 })
301 .map(PyUserDictionary::new)
302}