lindera/tokenizer.rs

//! Tokenizer implementation for morphological analysis.
//!
//! This module provides a builder pattern for creating tokenizers and the tokenizer itself.
//!
//! # Examples
//!
//! ```python
//! # Create a tokenizer with custom configuration
//! tokenizer = (lindera.TokenizerBuilder()
//!     .set_mode("normal")
//!     .append_token_filter("japanese_stop_tags", {"tags": ["助詞"]})
//!     .build())
//!
//! # Tokenize text
//! tokens = tokenizer.tokenize("すもももももももものうち")
//! ```

use std::path::Path;
use std::str::FromStr;

use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyDict;

use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

use crate::dictionary::{PyDictionary, PyUserDictionary};
use crate::util::{pydict_to_value, value_to_pydict};

pub type PyDictRef<'a> = &'a Bound<'a, PyDict>;

/// Builder for creating a `Tokenizer` with custom configuration.
///
/// The builder pattern allows fluent configuration of tokenizer parameters, including
/// dictionaries, modes, and filter pipelines.
///
/// # Examples
///
/// ```python
/// builder = lindera.TokenizerBuilder()
/// builder.set_mode("normal")
/// builder.set_dictionary("/path/to/dict")
/// tokenizer = builder.build()
/// ```
#[pyclass(name = "TokenizerBuilder")]
pub struct PyTokenizerBuilder {
    pub inner: TokenizerBuilder,
}

#[pymethods]
impl PyTokenizerBuilder {
    /// Creates a new `TokenizerBuilder` with default configuration.
    ///
    /// # Returns
    ///
    /// A new instance of `TokenizerBuilder`.
    ///
    /// # Errors
    ///
    /// Returns an error if the builder cannot be initialized.
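    ///
    /// # Examples
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder()
    /// ```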
    #[new]
    #[pyo3(signature = ())]
    fn new() -> PyResult<Self> {
        let inner = TokenizerBuilder::new().map_err(|err| {
            PyValueError::new_err(format!("Failed to create TokenizerBuilder: {err}"))
        })?;

        Ok(Self { inner })
    }

    /// Loads configuration from a file.
    ///
    /// # Arguments
    ///
    /// * `file_path` - Path to the configuration file.
    ///
    /// # Returns
    ///
    /// A new `TokenizerBuilder` with the loaded configuration.
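    ///
    /// # Examples
    ///
    /// A minimal sketch; the path is illustrative and the file must follow
    /// Lindera's tokenizer configuration format:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder()
    /// configured = builder.from_file("/path/to/lindera.yml")
    /// tokenizer = configured.build()
    /// ```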
    #[pyo3(signature = (file_path))]
    #[allow(clippy::wrong_self_convention)]
    fn from_file(&self, file_path: &str) -> PyResult<Self> {
        let inner = TokenizerBuilder::from_file(Path::new(file_path)).map_err(|err| {
            PyValueError::new_err(format!("Failed to load config from file: {err}"))
        })?;

        Ok(Self { inner })
    }

    /// Sets the tokenization mode.
    ///
    /// # Arguments
    ///
    /// * `mode` - Mode string ("normal" or "decompose").
    ///
    /// # Returns
    ///
    /// Self for method chaining.
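    ///
    /// # Examples
    ///
    /// A short sketch switching to decompose mode:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder()
    /// builder.set_mode("decompose")  # split compound words into smaller units
    /// ```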
    #[pyo3(signature = (mode))]
    fn set_mode<'a>(mut slf: PyRefMut<'a, Self>, mode: &str) -> PyResult<PyRefMut<'a, Self>> {
        let m = Mode::from_str(mode)
            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;

        slf.inner.set_segmenter_mode(&m);

        Ok(slf)
    }

    /// Sets the dictionary path.
    ///
    /// # Arguments
    ///
    /// * `path` - Path to the dictionary directory.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
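    ///
    /// # Examples
    ///
    /// A minimal sketch; the dictionary path is illustrative:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder()
    /// builder.set_dictionary("/path/to/ipadic")
    /// ```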
    #[pyo3(signature = (path))]
    fn set_dictionary<'a>(mut slf: PyRefMut<'a, Self>, path: &str) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_dictionary(path);

        Ok(slf)
    }

    /// Sets the user dictionary URI.
    ///
    /// # Arguments
    ///
    /// * `uri` - URI to the user dictionary.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
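    ///
    /// # Examples
    ///
    /// A minimal sketch; the URI is illustrative and must point to a user
    /// dictionary in a format Lindera supports:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder()
    /// builder.set_user_dictionary("/path/to/userdic.csv")
    /// ```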
    #[pyo3(signature = (uri))]
    fn set_user_dictionary<'a>(
        mut slf: PyRefMut<'a, Self>,
        uri: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_user_dictionary(uri);
        Ok(slf)
    }

    /// Sets whether to keep whitespace in tokenization results.
    ///
    /// # Arguments
    ///
    /// * `keep_whitespace` - If true, whitespace tokens will be included in results.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
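    ///
    /// # Examples
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder()
    /// builder.set_keep_whitespace(True)  # whitespace tokens appear in the output
    /// ```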
    #[pyo3(signature = (keep_whitespace))]
    fn set_keep_whitespace<'a>(
        mut slf: PyRefMut<'a, Self>,
        keep_whitespace: bool,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_keep_whitespace(keep_whitespace);
        Ok(slf)
    }

    /// Appends a character filter to the filter pipeline.
    ///
    /// # Arguments
    ///
    /// * `kind` - Type of character filter to add.
    /// * `args` - Optional dictionary of filter arguments.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
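    ///
    /// # Examples
    ///
    /// A sketch assuming a `unicode_normalize` character filter is available;
    /// filter names and argument keys must match the filters registered in Lindera:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder()
    /// builder.append_character_filter("unicode_normalize", {"kind": "nfkc"})
    /// ```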
    #[pyo3(signature = (kind, args=None))]
    fn append_character_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let filter_args = if let Some(dict) = args {
            pydict_to_value(dict)?
        } else {
            serde_json::Value::Object(serde_json::Map::new())
        };

        slf.inner.append_character_filter(kind, &filter_args);

        Ok(slf)
    }

    /// Appends a token filter to the filter pipeline.
    ///
    /// # Arguments
    ///
    /// * `kind` - Type of token filter to add.
    /// * `args` - Optional dictionary of filter arguments.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
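    ///
    /// # Examples
    ///
    /// A sketch assuming `lowercase` and `japanese_stop_tags` token filters are
    /// available; `args` can be omitted for filters that take no options:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder()
    /// builder.append_token_filter("lowercase")
    /// builder.append_token_filter("japanese_stop_tags", {"tags": ["助詞"]})
    /// ```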
    #[pyo3(signature = (kind, args=None))]
    fn append_token_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let filter_args = if let Some(dict) = args {
            pydict_to_value(dict)?
        } else {
            serde_json::Value::Object(serde_json::Map::new())
        };

        slf.inner.append_token_filter(kind, &filter_args);

        Ok(slf)
    }

    /// Builds the tokenizer with the configured settings.
    ///
    /// # Returns
    ///
    /// A configured `Tokenizer` instance ready for use.
    ///
    /// # Errors
    ///
    /// Returns an error if the tokenizer cannot be built with the current configuration.
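    ///
    /// # Examples
    ///
    /// ```python
    /// tokenizer = (lindera.TokenizerBuilder()
    ///     .set_mode("normal")
    ///     .build())
    /// tokens = tokenizer.tokenize("すもももももももものうち")
    /// ```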
    #[pyo3(signature = ())]
    fn build(&self) -> PyResult<PyTokenizer> {
        let tokenizer = self
            .inner
            .build()
            .map_err(|err| PyValueError::new_err(format!("Failed to build tokenizer: {err}")))?;

        Ok(PyTokenizer { inner: tokenizer })
    }
}

/// Tokenizer for performing morphological analysis.
///
/// The tokenizer processes text and returns tokens with their morphological features.
///
/// # Examples
///
/// ```python
/// # Using TokenizerBuilder (recommended)
/// tokenizer = lindera.TokenizerBuilder().build()
///
/// # Or create directly with a dictionary
/// dictionary = lindera.load_dictionary("ipadic")
/// tokenizer = lindera.Tokenizer(dictionary, mode="normal")
/// ```
#[pyclass(name = "Tokenizer")]
pub struct PyTokenizer {
    inner: Tokenizer,
}

#[pymethods]
impl PyTokenizer {
    /// Creates a new tokenizer with the given dictionary and mode.
    ///
    /// # Arguments
    ///
    /// * `dictionary` - Dictionary to use for tokenization.
    /// * `mode` - Tokenization mode ("normal" or "decompose"). Default: "normal".
    /// * `user_dictionary` - Optional user dictionary for custom words.
    ///
    /// # Returns
    ///
    /// A new `Tokenizer` instance.
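    ///
    /// # Examples
    ///
    /// A minimal sketch, assuming an IPADIC dictionary can be loaded:
    ///
    /// ```python
    /// dictionary = lindera.load_dictionary("ipadic")
    /// tokenizer = lindera.Tokenizer(dictionary, mode="decompose")
    /// ```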
    #[new]
    #[pyo3(signature = (dictionary, mode="normal", user_dictionary=None))]
    fn new(
        dictionary: PyDictionary,
        mode: &str,
        user_dictionary: Option<PyUserDictionary>,
    ) -> PyResult<Self> {
        let m = Mode::from_str(mode)
            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;

        let dict = dictionary.inner;
        let user_dict = user_dictionary.map(|d| d.inner);

        let segmenter = Segmenter::new(m, dict, user_dict);
        let tokenizer = Tokenizer::new(segmenter);

        Ok(Self { inner: tokenizer })
    }

    /// Tokenizes the given text.
    ///
    /// # Arguments
    ///
    /// * `text` - Text to tokenize.
    ///
    /// # Returns
    ///
    /// A list of token dictionaries containing morphological features.
    ///
    /// # Errors
    ///
    /// Returns an error if tokenization fails.
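    ///
    /// # Examples
    ///
    /// A minimal sketch; each returned item is a dict whose exact keys depend on
    /// the token's morphological features:
    ///
    /// ```python
    /// tokens = tokenizer.tokenize("すもももももももものうち")
    /// for token in tokens:
    ///     print(token)
    /// ```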
    #[pyo3(signature = (text))]
    fn tokenize(&self, py: Python<'_>, text: &str) -> PyResult<Vec<Py<PyAny>>> {
        // Tokenize the input text
        let mut tokens = self
            .inner
            .tokenize(text)
            .map_err(|err| PyValueError::new_err(format!("Failed to tokenize text: {err}")))?;

        // Convert each token's features into a Python dictionary
        let py_tokens: Vec<Py<PyAny>> = tokens
            .iter_mut()
            .map(|t| {
                let v = t.as_value();
                value_to_pydict(py, &v).map_err(|err| {
                    PyValueError::new_err(format!("Failed to convert token to dict: {err}"))
                })
            })
            .collect::<Result<Vec<_>, _>>()?;

        Ok(py_tokens)
    }
}