lindera/tokenizer.rs
//! Tokenizer implementation for morphological analysis.
//!
//! This module provides a builder pattern for creating tokenizers and the tokenizer itself.
//!
//! # Examples
//!
//! ```python
//! # Create a tokenizer with custom configuration
//! tokenizer = (lindera.TokenizerBuilder()
//!     .set_mode("normal")
//!     .append_token_filter("japanese_stop_tags", {"tags": ["助詞"]})
//!     .build())
//!
//! # Tokenize text
//! tokens = tokenizer.tokenize("すもももももももものうち")
//! ```

use std::path::Path;
use std::str::FromStr;

use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyDict;

use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

use crate::dictionary::{PyDictionary, PyUserDictionary};
use crate::util::{pydict_to_value, value_to_pydict};

pub type PyDictRef<'a> = &'a Bound<'a, PyDict>;

/// Builder for creating a `Tokenizer` with custom configuration.
///
/// The builder pattern allows for fluent configuration of tokenizer parameters including
/// dictionaries, modes, and filter pipelines.
///
/// # Examples
///
/// ```python
/// builder = lindera.TokenizerBuilder()
/// builder.set_mode("normal")
/// builder.set_dictionary("/path/to/dict")
/// tokenizer = builder.build()
/// ```
#[pyclass(name = "TokenizerBuilder")]
pub struct PyTokenizerBuilder {
    pub inner: TokenizerBuilder,
}

#[pymethods]
impl PyTokenizerBuilder {
    /// Creates a new `TokenizerBuilder` with default configuration.
    ///
    /// # Returns
    ///
    /// A new instance of `TokenizerBuilder`.
    ///
    /// # Errors
    ///
    /// Returns an error if the builder cannot be initialized.
    #[new]
    #[pyo3(signature = ())]
    fn new() -> PyResult<Self> {
        let inner = TokenizerBuilder::new().map_err(|err| {
            PyValueError::new_err(format!("Failed to create TokenizerBuilder: {err}"))
        })?;

        Ok(Self { inner })
    }

    /// Loads configuration from a file.
    ///
    /// Note that the returned builder is created entirely from the file; it does not
    /// inherit any settings already applied to this builder.
    ///
    /// # Arguments
    ///
    /// * `file_path` - Path to the configuration file.
    ///
    /// # Returns
    ///
    /// A new `TokenizerBuilder` with the loaded configuration.
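    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage; the config file path is illustrative:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().from_file("/path/to/config.yml")
    /// tokenizer = builder.build()
    /// ```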
    #[pyo3(signature = (file_path))]
    #[allow(clippy::wrong_self_convention)]
    fn from_file(&self, file_path: &str) -> PyResult<Self> {
        let inner = TokenizerBuilder::from_file(Path::new(file_path)).map_err(|err| {
            PyValueError::new_err(format!("Failed to load config from file: {err}"))
        })?;

        Ok(Self { inner })
    }

    /// Sets the tokenization mode.
    ///
    /// # Arguments
    ///
    /// * `mode` - Mode string ("normal" or "decompose").
    ///
    /// # Returns
    ///
    /// Self for method chaining.
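    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().set_mode("decompose")
    /// ```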
    #[pyo3(signature = (mode))]
    fn set_mode<'a>(mut slf: PyRefMut<'a, Self>, mode: &str) -> PyResult<PyRefMut<'a, Self>> {
        let m = Mode::from_str(mode)
            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;

        slf.inner.set_segmenter_mode(&m);

        Ok(slf)
    }

    /// Sets the dictionary path.
    ///
    /// # Arguments
    ///
    /// * `path` - Path to the dictionary directory.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
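    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage; the dictionary path is illustrative:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().set_dictionary("/path/to/dict")
    /// ```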
    #[pyo3(signature = (path))]
    fn set_dictionary<'a>(mut slf: PyRefMut<'a, Self>, path: &str) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_dictionary(path);

        Ok(slf)
    }

    /// Sets the user dictionary URI.
    ///
    /// # Arguments
    ///
    /// * `uri` - URI to the user dictionary.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
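    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage; the user dictionary URI is illustrative:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().set_user_dictionary("/path/to/userdic.csv")
    /// ```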
    #[pyo3(signature = (uri))]
    fn set_user_dictionary<'a>(
        mut slf: PyRefMut<'a, Self>,
        uri: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_user_dictionary(uri);
        Ok(slf)
    }

    /// Sets whether to keep whitespace in tokenization results.
    ///
    /// # Arguments
    ///
    /// * `keep_whitespace` - If true, whitespace tokens will be included in results.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
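    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().set_keep_whitespace(True)
    /// ```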
    #[pyo3(signature = (keep_whitespace))]
    fn set_keep_whitespace<'a>(
        mut slf: PyRefMut<'a, Self>,
        keep_whitespace: bool,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_keep_whitespace(keep_whitespace);
        Ok(slf)
    }

    /// Appends a character filter to the filter pipeline.
    ///
    /// # Arguments
    ///
    /// * `kind` - Type of character filter to add.
    /// * `args` - Optional dictionary of filter arguments.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
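    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage; the filter name and arguments are
    /// illustrative and depend on the character filters available in Lindera:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().append_character_filter(
    ///     "unicode_normalize", {"kind": "nfkc"}
    /// )
    /// ```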
    #[pyo3(signature = (kind, args=None))]
    fn append_character_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let filter_args = if let Some(dict) = args {
            pydict_to_value(dict)?
        } else {
            serde_json::Value::Object(serde_json::Map::new())
        };

        slf.inner.append_character_filter(kind, &filter_args);

        Ok(slf)
    }

    /// Appends a token filter to the filter pipeline.
    ///
    /// # Arguments
    ///
    /// * `kind` - Type of token filter to add.
    /// * `args` - Optional dictionary of filter arguments.
    ///
    /// # Returns
    ///
    /// Self for method chaining.
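    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage, reusing the filter from the module-level
    /// example; available filter names depend on the token filters compiled into Lindera:
    ///
    /// ```python
    /// builder = lindera.TokenizerBuilder().append_token_filter(
    ///     "japanese_stop_tags", {"tags": ["助詞"]}
    /// )
    /// ```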
    #[pyo3(signature = (kind, args=None))]
    fn append_token_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let filter_args = if let Some(dict) = args {
            pydict_to_value(dict)?
        } else {
            serde_json::Value::Object(serde_json::Map::new())
        };

        slf.inner.append_token_filter(kind, &filter_args);

        Ok(slf)
    }

    /// Builds the tokenizer with the configured settings.
    ///
    /// # Returns
    ///
    /// A configured `Tokenizer` instance ready for use.
    ///
    /// # Errors
    ///
    /// Returns an error if the tokenizer cannot be built with the current configuration.
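    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage:
    ///
    /// ```python
    /// tokenizer = lindera.TokenizerBuilder().set_mode("normal").build()
    /// ```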
    #[pyo3(signature = ())]
    fn build(&self) -> PyResult<PyTokenizer> {
        let tokenizer = self
            .inner
            .build()
            .map_err(|err| PyValueError::new_err(format!("Failed to build tokenizer: {err}")))?;

        Ok(PyTokenizer { inner: tokenizer })
    }
}

/// Tokenizer for performing morphological analysis.
///
/// The tokenizer processes text and returns tokens with their morphological features.
///
/// # Examples
///
/// ```python
/// # Using TokenizerBuilder (recommended)
/// tokenizer = lindera.TokenizerBuilder().build()
///
/// # Or create directly with a dictionary
/// dictionary = lindera.load_dictionary("ipadic")
/// tokenizer = lindera.Tokenizer(dictionary, mode="normal")
/// ```
#[pyclass(name = "Tokenizer")]
pub struct PyTokenizer {
    inner: Tokenizer,
}

#[pymethods]
impl PyTokenizer {
    /// Creates a new tokenizer with the given dictionary and mode.
    ///
    /// # Arguments
    ///
    /// * `dictionary` - Dictionary to use for tokenization.
    /// * `mode` - Tokenization mode ("normal" or "decompose"). Default: "normal".
    /// * `user_dictionary` - Optional user dictionary for custom words.
    ///
    /// # Returns
    ///
    /// A new `Tokenizer` instance.
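    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage, reusing the dictionary loading shown in the
    /// struct-level example:
    ///
    /// ```python
    /// dictionary = lindera.load_dictionary("ipadic")
    /// tokenizer = lindera.Tokenizer(dictionary, mode="decompose")
    /// ```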
    #[new]
    #[pyo3(signature = (dictionary, mode="normal", user_dictionary=None))]
    fn new(
        dictionary: PyDictionary,
        mode: &str,
        user_dictionary: Option<PyUserDictionary>,
    ) -> PyResult<Self> {
        let m = Mode::from_str(mode)
            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;

        let dict = dictionary.inner;
        let user_dict = user_dictionary.map(|d| d.inner);

        let segmenter = Segmenter::new(m, dict, user_dict);
        let tokenizer = Tokenizer::new(segmenter);

        Ok(Self { inner: tokenizer })
    }

    /// Tokenizes the given text.
    ///
    /// # Arguments
    ///
    /// * `text` - Text to tokenize.
    ///
    /// # Returns
    ///
    /// A list of token dictionaries containing morphological features.
    ///
    /// # Errors
    ///
    /// Returns an error if tokenization fails.
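    ///
    /// # Examples
    ///
    /// A minimal sketch of Python-side usage; the exact keys of each token dictionary
    /// depend on the dictionary and filters in use:
    ///
    /// ```python
    /// tokens = tokenizer.tokenize("すもももももももものうち")
    /// for token in tokens:
    ///     print(token)
    /// ```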
    #[pyo3(signature = (text))]
    fn tokenize(&self, py: Python<'_>, text: &str) -> PyResult<Vec<Py<PyAny>>> {
        // Tokenize the input text
        let mut tokens = self
            .inner
            .tokenize(text)
            .map_err(|err| PyValueError::new_err(format!("Failed to tokenize text: {err}")))?;

        // Convert to Python dictionaries
        let py_tokens: Vec<Py<PyAny>> = tokens
            .iter_mut()
            .map(|t| {
                let v = t.as_value();
                value_to_pydict(py, &v).map_err(|err| {
                    PyValueError::new_err(format!("Failed to convert token to dict: {err}"))
                })
            })
            .collect::<Result<Vec<_>, _>>()?;

        Ok(py_tokens)
    }
}