// lindera_py/tokenizer.rs

use std::path::Path;
use std::str::FromStr;

use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyDict;
use serde_json::json;

use lindera::character_filter::CharacterFilterLoader;
use lindera::dictionary::DictionaryKind;
use lindera::mode::Mode;
use lindera::token_filter::TokenFilterLoader;
use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

use crate::segmenter::PySegmenter;
use crate::token::PyToken;
use crate::util::pydict_to_value;

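/// Python binding for building a Lindera `Tokenizer` from segmenter settings,
/// dictionaries, and optional character/token filters.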
#[pyclass(name = "TokenizerBuilder")]
pub struct PyTokenizerBuilder {
    pub inner: TokenizerBuilder,
}

#[pymethods]
impl PyTokenizerBuilder {
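    /// Creates a builder with the default configuration.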
    #[new]
    #[pyo3(signature = ())]
    fn new() -> PyResult<Self> {
        let inner = TokenizerBuilder::new().map_err(|err| {
            PyValueError::new_err(format!("Failed to create TokenizerBuilder: {err}"))
        })?;

        Ok(Self { inner })
    }

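    /// Loads the builder configuration from the file at `file_path`.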
    #[pyo3(signature = (file_path))]
    #[allow(clippy::wrong_self_convention)]
    fn from_file(&self, file_path: &str) -> PyResult<Self> {
        let inner = TokenizerBuilder::from_file(Path::new(file_path)).map_err(|err| {
            PyValueError::new_err(format!("Failed to load config from file: {err}"))
        })?;

        Ok(Self { inner })
    }

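    /// Sets the segmenter mode from its string name and returns the builder
    /// so that calls can be chained.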
    #[pyo3(signature = (mode))]
    fn set_mode<'a>(mut slf: PyRefMut<'a, Self>, mode: &str) -> PyResult<PyRefMut<'a, Self>> {
        let m = Mode::from_str(mode)
            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;

        slf.inner.set_segmenter_mode(&m);

        Ok(slf)
    }

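    /// Selects the segmenter dictionary by its kind name.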
    #[pyo3(signature = (kind))]
    fn set_dictionary_kind<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let k = DictionaryKind::from_str(kind)
            .map_err(|err| PyValueError::new_err(format!("Failed to create kind: {err}")))?;

        slf.inner.set_segmenter_dictionary_kind(&k);

        Ok(slf)
    }

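    /// Sets the filesystem path to a pre-built segmenter dictionary.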
    #[pyo3(signature = (path))]
    fn set_dictionary_path<'a>(
        mut slf: PyRefMut<'a, Self>,
        path: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_dictionary_path(Path::new(path));

        Ok(slf)
    }

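    /// Sets the filesystem path to a user dictionary.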
    #[pyo3(signature = (path))]
    fn set_user_dictionary_path<'a>(
        mut slf: PyRefMut<'a, Self>,
        path: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner
            .set_segmenter_user_dictionary_path(Path::new(path));

        Ok(slf)
    }

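    /// Sets the dictionary kind used to interpret the user dictionary.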
    #[pyo3(signature = (kind))]
    fn set_user_dictionary_kind<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let k = DictionaryKind::from_str(kind)
            .map_err(|err| PyValueError::new_err(format!("Failed to create kind: {err}")))?;

        slf.inner.set_segmenter_user_dictionary_kind(&k);

        Ok(slf)
    }

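    /// Appends a character filter by name; optional keyword arguments are
    /// passed through as the filter's JSON configuration.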
    #[pyo3(signature = (name, **args))]
    fn append_character_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        name: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let character_filter_args = match args {
            Some(a) => pydict_to_value(a)?,
            None => json!({}),
        };

        slf.inner
            .append_character_filter(name, &character_filter_args);

        Ok(slf)
    }

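    /// Appends a token filter by name; optional keyword arguments are
    /// passed through as the filter's JSON configuration.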
    #[pyo3(signature = (name, **args))]
    fn append_token_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        name: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let token_filter_args = match args {
            Some(a) => pydict_to_value(a)?,
            None => json!({}),
        };

        slf.inner.append_token_filter(name, &token_filter_args);

        Ok(slf)
    }

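    /// Builds a `Tokenizer` from the accumulated configuration.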
    #[pyo3(signature = ())]
    fn build(&self) -> PyResult<PyTokenizer> {
        self.inner
            .build()
            .map_err(|err| PyValueError::new_err(format!("Failed to build tokenizer: {err}")))
            .map(|t| PyTokenizer { inner: t })
    }
}

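/// Python binding for the Lindera tokenizer: splits text into tokens and
/// applies any registered character and token filters.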
#[pyclass(name = "Tokenizer")]
pub struct PyTokenizer {
    inner: Tokenizer,
}

#[pymethods]
impl PyTokenizer {
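    /// Creates a tokenizer from an existing segmenter.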
    #[new]
    #[pyo3(signature = (segmenter))]
    fn new(segmenter: PySegmenter) -> PyResult<Self> {
        Ok(Self {
            inner: Tokenizer::new(segmenter.inner),
        })
    }

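    /// Creates a tokenizer from a configuration dictionary; the dictionary is
    /// converted to a JSON value before being handed to Lindera.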
    #[pyo3(signature = (config))]
    #[allow(clippy::wrong_self_convention)]
    fn from_config(&self, config: &Bound<'_, PyDict>) -> PyResult<Self> {
        let config_value = pydict_to_value(config)?;
        let tokenizer = Tokenizer::from_config(&config_value)
            .map_err(|err| PyValueError::new_err(format!("Failed to create tokenizer: {err}")))?;

        Ok(Self { inner: tokenizer })
    }

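    /// Loads a character filter by name and appends it to this tokenizer.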
    #[pyo3(signature = (name, **args))]
    fn append_character_filter(
        &mut self,
        name: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<()> {
        let value = match args {
            Some(pydict) => pydict_to_value(pydict)?,
            None => json!({}),
        };

        let filter = CharacterFilterLoader::load_from_value(name, &value).map_err(|err| {
            PyValueError::new_err(format!("Failed to load character filter: {err}"))
        })?;
        self.inner.append_character_filter(filter);

        Ok(())
    }

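    /// Loads a token filter by name and appends it to this tokenizer.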
    #[pyo3(signature = (name, **args))]
    fn append_token_filter(
        &mut self,
        name: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<()> {
        let value = match args {
            Some(pydict) => pydict_to_value(pydict)?,
            None => json!({}),
        };

        let filter = TokenFilterLoader::load_from_value(name, &value)
            .map_err(|err| PyValueError::new_err(format!("Failed to load token filter: {err}")))?;
        self.inner.append_token_filter(filter);

        Ok(())
    }

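    /// Tokenizes `text`, copying each token's surface form, byte offsets,
    /// position information, and detail fields into Python-facing tokens.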
    #[pyo3(signature = (text))]
    fn tokenize(&self, text: &str) -> PyResult<Vec<PyToken>> {
        let mut tokens = self
            .inner
            .tokenize(text)
            .map_err(|err| PyValueError::new_err(format!("Failed to tokenize text: {err}")))?;

        Ok(tokens
            .iter_mut()
            .map(|t| PyToken {
                #[allow(clippy::suspicious_to_owned)]
                text: t.text.to_owned().to_string(),
                byte_start: t.byte_start,
                byte_end: t.byte_end,
                position: t.position,
                position_length: t.position_length,
                details: t.details().iter().map(|d| d.to_string()).collect(),
            })
            .collect())
    }
}