lindera/tokenizer.rs
//! Tokenizer implementation for morphological analysis.
//!
//! This module provides a builder for creating tokenizers, as well as the tokenizer itself.
//!
//! # Examples
//!
//! ```python
//! # Create a tokenizer with custom configuration
//! tokenizer = (lindera.TokenizerBuilder()
//!     .set_mode("normal")
//!     .append_token_filter("japanese_stop_tags", {"tags": ["助詞"]})
//!     .build())
//!
//! # Tokenize text
//! tokens = tokenizer.tokenize("すもももももももものうち")
//! ```

use std::path::Path;
use std::str::FromStr;

use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyDict;

use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

use crate::dictionary::{PyDictionary, PyUserDictionary};
use crate::token::PyToken;
use crate::util::pydict_to_value;

pub type PyDictRef<'a> = &'a Bound<'a, PyDict>;

/// Builder for creating a `Tokenizer` with custom configuration.
///
/// The builder supports fluent configuration of tokenizer parameters, including the
/// dictionary, tokenization mode, and character/token filter pipelines.
///
/// # Examples
///
/// ```python
/// builder = lindera.TokenizerBuilder()
/// builder.set_mode("normal")
/// builder.set_dictionary("/path/to/dict")
/// tokenizer = builder.build()
/// ```
#[pyclass(name = "TokenizerBuilder")]
pub struct PyTokenizerBuilder {
    pub inner: TokenizerBuilder,
}

#[pymethods]
impl PyTokenizerBuilder {
    /// Creates a new `TokenizerBuilder` with default configuration.
    ///
    /// # Returns
    ///
    /// A new instance of `TokenizerBuilder`.
    ///
    /// # Errors
    ///
    /// Returns an error if the builder cannot be initialized.
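    ///
    /// # Examples
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder()
    /// ```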
    #[new]
    #[pyo3(signature = ())]
    fn new() -> PyResult<Self> {
        let inner = TokenizerBuilder::new().map_err(|err| {
            PyValueError::new_err(format!("Failed to create TokenizerBuilder: {err}"))
        })?;

        Ok(Self { inner })
    }

    /// Loads configuration from a file.
    ///
    /// Note that this returns a new builder constructed from the file; the receiver's
    /// own configuration is left unchanged.
    ///
    /// # Arguments
    ///
    /// * `file_path` - Path to the configuration file.
    ///
    /// # Returns
    ///
    /// A new `TokenizerBuilder` with the loaded configuration.
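    ///
    /// # Examples
    ///
    /// Illustrative usage; the config path below is a placeholder:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().from_file("/path/to/config.yml")
    /// ```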
    #[pyo3(signature = (file_path))]
    #[allow(clippy::wrong_self_convention)]
    fn from_file(&self, file_path: &str) -> PyResult<Self> {
        let inner = TokenizerBuilder::from_file(Path::new(file_path)).map_err(|err| {
            PyValueError::new_err(format!("Failed to load config from file: {err}"))
        })?;

        Ok(Self { inner })
    }

    /// Sets the tokenization mode.
    ///
    /// # Arguments
    ///
    /// * `mode` - Mode string ("normal" or "decompose").
    ///
    /// # Returns
    ///
    /// Self for method chaining.
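    ///
    /// # Examples
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().set_mode("decompose")
    /// ```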
    #[pyo3(signature = (mode))]
    fn set_mode<'a>(mut slf: PyRefMut<'a, Self>, mode: &str) -> PyResult<PyRefMut<'a, Self>> {
        let m = Mode::from_str(mode)
            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;

        slf.inner.set_segmenter_mode(&m);

        Ok(slf)
    }

    /// Sets the dictionary path.
    ///
    /// # Arguments
    ///
    /// * `path` - Path to the dictionary directory.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
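    ///
    /// # Examples
    ///
    /// Illustrative usage; the dictionary path below is a placeholder:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().set_dictionary("/path/to/dict")
    /// ```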
    #[pyo3(signature = (path))]
    fn set_dictionary<'a>(mut slf: PyRefMut<'a, Self>, path: &str) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_dictionary(path);

        Ok(slf)
    }

    /// Sets the user dictionary URI.
    ///
    /// # Arguments
    ///
    /// * `uri` - URI of the user dictionary.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
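    ///
    /// # Examples
    ///
    /// Illustrative usage; the user dictionary URI below is a placeholder:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().set_user_dictionary("/path/to/userdic.csv")
    /// ```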
    #[pyo3(signature = (uri))]
    fn set_user_dictionary<'a>(
        mut slf: PyRefMut<'a, Self>,
        uri: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_user_dictionary(uri);
        Ok(slf)
    }

    /// Sets whether to keep whitespace in tokenization results.
    ///
    /// # Arguments
    ///
    /// * `keep_whitespace` - If true, whitespace tokens will be included in results.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
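    ///
    /// # Examples
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().set_keep_whitespace(True)
    /// ```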
    #[pyo3(signature = (keep_whitespace))]
    fn set_keep_whitespace<'a>(
        mut slf: PyRefMut<'a, Self>,
        keep_whitespace: bool,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_keep_whitespace(keep_whitespace);
        Ok(slf)
    }

    /// Appends a character filter to the filter pipeline.
    ///
    /// # Arguments
    ///
    /// * `kind` - Type of character filter to add.
    /// * `args` - Optional dictionary of filter arguments.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
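    ///
    /// # Examples
    ///
    /// Illustrative usage; the filter kind and arguments below are examples, and the
    /// available filters depend on the underlying lindera build:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().append_character_filter(
    ///     "unicode_normalize", {"kind": "nfkc"}
    /// )
    /// ```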
    #[pyo3(signature = (kind, args=None))]
    fn append_character_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        // Convert the optional Python dict to a JSON value; use an empty object when omitted
        let filter_args = if let Some(dict) = args {
            pydict_to_value(dict)?
        } else {
            serde_json::Value::Object(serde_json::Map::new())
        };

        slf.inner.append_character_filter(kind, &filter_args);

        Ok(slf)
    }

    /// Appends a token filter to the filter pipeline.
    ///
    /// # Arguments
    ///
    /// * `kind` - Type of token filter to add.
    /// * `args` - Optional dictionary of filter arguments.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
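    ///
    /// # Examples
    ///
    /// ```python
    /// # Drop tokens whose part-of-speech tag is 助詞 (particle)
    /// builder = lindera.TokenizerBuilder().append_token_filter(
    ///     "japanese_stop_tags", {"tags": ["助詞"]}
    /// )
    /// ```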
    #[pyo3(signature = (kind, args=None))]
    fn append_token_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        // Convert the optional Python dict to a JSON value; use an empty object when omitted
        let filter_args = if let Some(dict) = args {
            pydict_to_value(dict)?
        } else {
            serde_json::Value::Object(serde_json::Map::new())
        };

        slf.inner.append_token_filter(kind, &filter_args);

        Ok(slf)
    }

    /// Builds the tokenizer with the configured settings.
    ///
    /// # Returns
    ///
    /// A configured `Tokenizer` instance ready for use.
    ///
    /// # Errors
    ///
    /// Returns an error if the tokenizer cannot be built with the current configuration.
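    ///
    /// # Examples
    ///
    /// ```python
    /// tokenizer = lindera.TokenizerBuilder().build()
    /// ```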
    #[pyo3(signature = ())]
    fn build(&self) -> PyResult<PyTokenizer> {
        let tokenizer = self
            .inner
            .build()
            .map_err(|err| PyValueError::new_err(format!("Failed to build tokenizer: {err}")))?;

        Ok(PyTokenizer { inner: tokenizer })
    }
}

/// Tokenizer for performing morphological analysis.
///
/// The tokenizer processes text and returns tokens with their morphological features.
///
/// # Examples
///
/// ```python
/// # Using TokenizerBuilder (recommended)
/// tokenizer = lindera.TokenizerBuilder().build()
///
/// # Or create directly with a dictionary
/// dictionary = lindera.load_dictionary("ipadic")
/// tokenizer = lindera.Tokenizer(dictionary, mode="normal")
/// ```
#[pyclass(name = "Tokenizer")]
pub struct PyTokenizer {
    inner: Tokenizer,
}

#[pymethods]
impl PyTokenizer {
    /// Creates a new tokenizer with the given dictionary and mode.
    ///
    /// # Arguments
    ///
    /// * `dictionary` - Dictionary to use for tokenization.
    /// * `mode` - Tokenization mode ("normal" or "decompose"). Default: "normal".
    /// * `user_dictionary` - Optional user dictionary for custom words.
    ///
    /// # Returns
    ///
    /// A new `Tokenizer` instance.
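    ///
    /// # Examples
    ///
    /// ```python
    /// dictionary = lindera.load_dictionary("ipadic")
    /// tokenizer = lindera.Tokenizer(dictionary, mode="normal")
    /// ```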
    #[new]
    #[pyo3(signature = (dictionary, mode="normal", user_dictionary=None))]
    fn new(
        dictionary: PyDictionary,
        mode: &str,
        user_dictionary: Option<PyUserDictionary>,
    ) -> PyResult<Self> {
        let m = Mode::from_str(mode)
            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;

        let dict = dictionary.inner;
        let user_dict = user_dictionary.map(|d| d.inner);

        let segmenter = Segmenter::new(m, dict, user_dict);
        let tokenizer = Tokenizer::new(segmenter);

        Ok(Self { inner: tokenizer })
    }

    /// Tokenizes the given text.
    ///
    /// # Arguments
    ///
    /// * `text` - Text to tokenize.
    ///
    /// # Returns
    ///
    /// A list of `Token` objects containing morphological features.
    ///
    /// # Errors
    ///
    /// Returns an error if tokenization fails.
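    ///
    /// # Examples
    ///
    /// A minimal sketch; the `text` attribute on `Token` is assumed here:
    ///
    /// ```python
    /// tokens = tokenizer.tokenize("すもももももももものうち")
    /// for token in tokens:
    ///     print(token.text)
    /// ```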
    #[pyo3(signature = (text))]
    fn tokenize(&self, text: &str) -> PyResult<Vec<PyToken>> {
        // Tokenize the input text; the inner tokenizer applies any configured
        // character and token filters
        let tokens = self
            .inner
            .tokenize(text)
            .map_err(|err| PyValueError::new_err(format!("Failed to tokenize text: {err}")))?;

        // Convert lindera tokens into Python-facing PyToken objects
        let py_tokens: Vec<PyToken> = tokens.into_iter().map(PyToken::from_token).collect();

        Ok(py_tokens)
    }
}

pub fn register(parent_module: &Bound<'_, PyModule>) -> PyResult<()> {
    let py = parent_module.py();
    let m = PyModule::new(py, "tokenizer")?;
    m.add_class::<PyTokenizerBuilder>()?;
    m.add_class::<PyTokenizer>()?;
    parent_module.add_submodule(&m)?;
    Ok(())
}