1//! Tokenizer implementation for morphological analysis.
2//!
3//! This module provides a builder pattern for creating tokenizers and the tokenizer itself.
4//!
5//! # Examples
6//!
7//! ```python
8//! # Create a tokenizer with custom configuration
9//! tokenizer = (lindera.TokenizerBuilder()
10//!     .set_mode("normal")
11//!     .append_token_filter("japanese_stop_tags", {"tags": ["助詞"]})
12//!     .build())
13//!
14//! # Tokenize text
15//! tokens = tokenizer.tokenize("すもももももももものうち")
16//! ```
17
18use std::path::Path;
19use std::str::FromStr;
20
21use pyo3::exceptions::PyValueError;
22use pyo3::prelude::*;
23use pyo3::types::PyDict;
24
25use lindera::mode::Mode;
26use lindera::segmenter::Segmenter;
27use lindera::tokenizer::{Tokenizer, TokenizerBuilder};
28
29use crate::dictionary::{PyDictionary, PyUserDictionary};
30use crate::token::PyToken;
31use crate::util::pydict_to_value;
32
/// Convenience alias for a borrowed reference to a Python `dict`.
// NOTE(review): not referenced anywhere in this file — presumably consumed by
// sibling modules; confirm before removing.
pub type PyDictRef<'a> = &'a Bound<'a, PyDict>;
34
/// Builder for creating a `Tokenizer` with custom configuration.
///
/// The builder pattern allows for fluent configuration of tokenizer parameters including
/// dictionaries, modes, and filter pipelines.
///
/// # Examples
///
/// ```python
/// builder = lindera.TokenizerBuilder()
/// builder.set_mode("normal")
/// builder.set_dictionary("/path/to/dict")
/// tokenizer = builder.build()
/// ```
#[pyclass(name = "TokenizerBuilder")]
pub struct PyTokenizerBuilder {
    /// Underlying lindera builder that accumulates the configuration.
    pub inner: TokenizerBuilder,
}
52
53#[pymethods]
54impl PyTokenizerBuilder {
55    /// Creates a new `TokenizerBuilder` with default configuration.
56    ///
57    /// # Returns
58    ///
59    /// A new instance of `TokenizerBuilder`.
60    ///
61    /// # Errors
62    ///
63    /// Returns an error if the builder cannot be initialized.
64    #[new]
65    #[pyo3(signature = ())]
66    fn new() -> PyResult<Self> {
67        let inner = TokenizerBuilder::new().map_err(|err| {
68            PyValueError::new_err(format!("Failed to create TokenizerBuilder: {err}"))
69        })?;
70
71        Ok(Self { inner })
72    }
73
74    /// Loads configuration from a file.
75    ///
76    /// # Arguments
77    ///
78    /// * `file_path` - Path to the configuration file.
79    ///
80    /// # Returns
81    ///
82    /// A new `TokenizerBuilder` with the loaded configuration.
83    #[pyo3(signature = (file_path))]
84    #[allow(clippy::wrong_self_convention)]
85    fn from_file(&self, file_path: &str) -> PyResult<Self> {
86        let inner = TokenizerBuilder::from_file(Path::new(file_path)).map_err(|err| {
87            PyValueError::new_err(format!("Failed to load config from file: {err}"))
88        })?;
89
90        Ok(Self { inner })
91    }
92
93    /// Sets the tokenization mode.
94    ///
95    /// # Arguments
96    ///
97    /// * `mode` - Mode string ("normal" or "decompose").
98    ///
99    /// # Returns
100    ///
101    /// Self for method chaining.
102    #[pyo3(signature = (mode))]
103    fn set_mode<'a>(mut slf: PyRefMut<'a, Self>, mode: &str) -> PyResult<PyRefMut<'a, Self>> {
104        let m = Mode::from_str(mode)
105            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;
106
107        slf.inner.set_segmenter_mode(&m);
108
109        Ok(slf)
110    }
111
112    /// Sets the dictionary path.
113    ///
114    /// # Arguments
115    ///
116    /// * `path` - Path to the dictionary directory.
117    ///
118    /// # Returns
119    ///
120    /// Self for method chaining.
121    #[pyo3(signature = (path))]
122    fn set_dictionary<'a>(mut slf: PyRefMut<'a, Self>, path: &str) -> PyResult<PyRefMut<'a, Self>> {
123        slf.inner.set_segmenter_dictionary(path);
124
125        Ok(slf)
126    }
127
128    /// Sets the user dictionary URI.
129    ///
130    /// # Arguments
131    ///
132    /// * `uri` - URI to the user dictionary.
133    ///
134    /// # Returns
135    ///
136    /// Self for method chaining.
137    #[pyo3(signature = (uri))]
138    fn set_user_dictionary<'a>(
139        mut slf: PyRefMut<'a, Self>,
140        uri: &str,
141    ) -> PyResult<PyRefMut<'a, Self>> {
142        slf.inner.set_segmenter_user_dictionary(uri);
143        Ok(slf)
144    }
145
146    /// Sets whether to keep whitespace in tokenization results.
147    ///
148    /// # Arguments
149    ///
150    /// * `keep_whitespace` - If true, whitespace tokens will be included in results.
151    ///
152    /// # Returns
153    ///
154    /// Self for method chaining.
155    #[pyo3(signature = (keep_whitespace))]
156    fn set_keep_whitespace<'a>(
157        mut slf: PyRefMut<'a, Self>,
158        keep_whitespace: bool,
159    ) -> PyResult<PyRefMut<'a, Self>> {
160        slf.inner.set_segmenter_keep_whitespace(keep_whitespace);
161        Ok(slf)
162    }
163
164    /// Appends a character filter to the filter pipeline.
165    ///
166    /// # Arguments
167    ///
168    /// * `kind` - Type of character filter to add.
169    /// * `args` - Optional dictionary of filter arguments.
170    ///
171    /// # Returns
172    ///
173    /// Self for method chaining.
174    #[pyo3(signature = (kind, args=None))]
175    fn append_character_filter<'a>(
176        mut slf: PyRefMut<'a, Self>,
177        kind: &str,
178        args: Option<&Bound<'_, PyDict>>,
179    ) -> PyResult<PyRefMut<'a, Self>> {
180        let filter_args = if let Some(dict) = args {
181            pydict_to_value(dict)?
182        } else {
183            serde_json::Value::Object(serde_json::Map::new())
184        };
185
186        slf.inner.append_character_filter(kind, &filter_args);
187
188        Ok(slf)
189    }
190
191    /// Appends a token filter to the filter pipeline.
192    ///
193    /// # Arguments
194    ///
195    /// * `kind` - Type of token filter to add.
196    /// * `args` - Optional dictionary of filter arguments.
197    ///
198    /// # Returns
199    ///
200    /// Self for method chaining.
201    #[pyo3(signature = (kind, args=None))]
202    fn append_token_filter<'a>(
203        mut slf: PyRefMut<'a, Self>,
204        kind: &str,
205        args: Option<&Bound<'_, PyDict>>,
206    ) -> PyResult<PyRefMut<'a, Self>> {
207        let filter_args = if let Some(dict) = args {
208            pydict_to_value(dict)?
209        } else {
210            serde_json::Value::Object(serde_json::Map::new())
211        };
212
213        slf.inner.append_token_filter(kind, &filter_args);
214
215        Ok(slf)
216    }
217
218    /// Builds the tokenizer with the configured settings.
219    ///
220    /// # Returns
221    ///
222    /// A configured `Tokenizer` instance ready for use.
223    ///
224    /// # Errors
225    ///
226    /// Returns an error if the tokenizer cannot be built with the current configuration.
227    #[pyo3(signature = ())]
228    fn build(&self) -> PyResult<PyTokenizer> {
229        let tokenizer = self
230            .inner
231            .build()
232            .map_err(|err| PyValueError::new_err(format!("Failed to build tokenizer: {err}")))?;
233
234        Ok(PyTokenizer { inner: tokenizer })
235    }
236}
237
/// Tokenizer for performing morphological analysis.
///
/// The tokenizer processes text and returns tokens with their morphological features.
///
/// # Examples
///
/// ```python
/// # Using TokenizerBuilder (recommended)
/// tokenizer = lindera.TokenizerBuilder().build()
///
/// # Or create directly with a dictionary
/// dictionary = lindera.load_dictionary("ipadic")
/// tokenizer = lindera.Tokenizer(dictionary, mode="normal")
/// ```
#[pyclass(name = "Tokenizer")]
pub struct PyTokenizer {
    // Underlying lindera tokenizer; not exposed to Python directly.
    inner: Tokenizer,
}
256
257#[pymethods]
258impl PyTokenizer {
259    /// Creates a new tokenizer with the given dictionary and mode.
260    ///
261    /// # Arguments
262    ///
263    /// * `dictionary` - Dictionary to use for tokenization.
264    /// * `mode` - Tokenization mode ("normal" or "decompose"). Default: "normal".
265    /// * `user_dictionary` - Optional user dictionary for custom words.
266    ///
267    /// # Returns
268    ///
269    /// A new `Tokenizer` instance.
270    #[new]
271    #[pyo3(signature = (dictionary, mode="normal", user_dictionary=None))]
272    fn new(
273        dictionary: PyDictionary,
274        mode: &str,
275        user_dictionary: Option<PyUserDictionary>,
276    ) -> PyResult<Self> {
277        let m = Mode::from_str(mode)
278            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;
279
280        let dict = dictionary.inner;
281        let user_dict = user_dictionary.map(|d| d.inner);
282
283        let segmenter = Segmenter::new(m, dict, user_dict);
284        let tokenizer = Tokenizer::new(segmenter);
285
286        Ok(Self { inner: tokenizer })
287    }
288
289    /// Tokenizes the given text.
290    ///
291    /// # Arguments
292    ///
293    /// * `text` - Text to tokenize.
294    ///
295    /// # Returns
296    ///
297    /// A list of Token objects containing morphological features.
298    ///
299    /// # Errors
300    ///
301    /// Returns an error if tokenization fails.
302    #[pyo3(signature = (text))]
303    fn tokenize(&self, text: &str) -> PyResult<Vec<PyToken>> {
304        // Tokenize the processed text
305        let tokens = self
306            .inner
307            .tokenize(text)
308            .map_err(|err| PyValueError::new_err(format!("Failed to tokenize text: {err}")))?;
309
310        // Convert to PyToken objects
311        let py_tokens: Vec<PyToken> = tokens.into_iter().map(PyToken::from_token).collect();
312
313        Ok(py_tokens)
314    }
315
316    /// Tokenizes the given text and returns N-best results.
317    ///
318    /// # Arguments
319    ///
320    /// * `text` - Text to tokenize.
321    /// * `n` - Number of N-best results to return.
322    ///
323    /// # Returns
324    ///
325    /// A list of lists of Token objects, ordered by cost (best first).
326    ///
327    /// # Errors
328    ///
329    /// Returns an error if tokenization fails.
330    #[pyo3(signature = (text, n, unique=false, cost_threshold=None))]
331    fn tokenize_nbest(
332        &self,
333        text: &str,
334        n: usize,
335        unique: bool,
336        cost_threshold: Option<i64>,
337    ) -> PyResult<Vec<(Vec<PyToken>, i64)>> {
338        let results = self
339            .inner
340            .tokenize_nbest(text, n, unique, cost_threshold)
341            .map_err(|err| {
342                PyValueError::new_err(format!("Failed to tokenize_nbest text: {err}"))
343            })?;
344
345        let py_results: Vec<(Vec<PyToken>, i64)> = results
346            .into_iter()
347            .map(|(tokens, cost)| (tokens.into_iter().map(PyToken::from_token).collect(), cost))
348            .collect();
349
350        Ok(py_results)
351    }
352}
353
354pub fn register(parent_module: &Bound<'_, PyModule>) -> PyResult<()> {
355    let py = parent_module.py();
356    let m = PyModule::new(py, "tokenizer")?;
357    m.add_class::<PyTokenizerBuilder>()?;
358    m.add_class::<PyTokenizer>()?;
359    parent_module.add_submodule(&m)?;
360    Ok(())
361}