use std::path::Path;
use std::str::FromStr;

use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyDict;
use serde_json::json;

use lindera::character_filter::CharacterFilterLoader;
use lindera::dictionary::DictionaryKind;
use lindera::mode::Mode;
use lindera::token_filter::TokenFilterLoader;
use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

use crate::segmenter::PySegmenter;
use crate::token::PyToken;
use crate::util::pydict_to_value;

/// Python-facing builder that wraps Lindera's `TokenizerBuilder`.
#[pyclass(name = "TokenizerBuilder")]
pub struct PyTokenizerBuilder {
    pub inner: TokenizerBuilder,
}

#[pymethods]
impl PyTokenizerBuilder {
    #[new]
    #[pyo3(signature = ())]
    fn new() -> PyResult<Self> {
        let inner = TokenizerBuilder::new().map_err(|err| {
            PyValueError::new_err(format!("Failed to create TokenizerBuilder: {err}"))
        })?;

        Ok(Self { inner })
    }

    /// Creates a new builder from a tokenizer configuration file.
    #[pyo3(signature = (file_path))]
    #[allow(clippy::wrong_self_convention)]
    fn from_file(&self, file_path: &str) -> PyResult<Self> {
        let inner = TokenizerBuilder::from_file(Path::new(file_path)).map_err(|err| {
            PyValueError::new_err(format!("Failed to load config from file: {err}"))
        })?;

        Ok(Self { inner })
    }

    /// Sets the segmentation mode by name. Returns the builder to allow method chaining.
    #[pyo3(signature = (mode))]
    fn set_mode<'a>(mut slf: PyRefMut<'a, Self>, mode: &str) -> PyResult<PyRefMut<'a, Self>> {
        let m = Mode::from_str(mode)
            .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {err}")))?;

        slf.inner.set_segmenter_mode(&m);

        Ok(slf)
    }

    /// Sets the dictionary kind used by the segmenter.
    #[pyo3(signature = (kind))]
    fn set_dictionary_kind<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let k = DictionaryKind::from_str(kind)
            .map_err(|err| PyValueError::new_err(format!("Failed to create kind: {err}")))?;

        slf.inner.set_segmenter_dictionary_kind(&k);

        Ok(slf)
    }

    /// Sets the path to the dictionary used by the segmenter.
    #[pyo3(signature = (path))]
    fn set_dictionary_path<'a>(
        mut slf: PyRefMut<'a, Self>,
        path: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner.set_segmenter_dictionary_path(Path::new(path));

        Ok(slf)
    }

    /// Sets the path to a user dictionary.
    #[pyo3(signature = (path))]
    fn set_user_dictionary_path<'a>(
        mut slf: PyRefMut<'a, Self>,
        path: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        slf.inner
            .set_segmenter_user_dictionary_path(Path::new(path));

        Ok(slf)
    }

    /// Sets the kind of the user dictionary.
    #[pyo3(signature = (kind))]
    fn set_user_dictionary_kind<'a>(
        mut slf: PyRefMut<'a, Self>,
        kind: &str,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let k = DictionaryKind::from_str(kind)
            .map_err(|err| PyValueError::new_err(format!("Failed to create kind: {err}")))?;

        slf.inner.set_segmenter_user_dictionary_kind(&k);

        Ok(slf)
    }

    /// Appends a character filter to the tokenizer configuration.
    /// Keyword arguments are converted to a JSON value and used as the filter settings.
    #[pyo3(signature = (name, **args))]
    fn append_character_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        name: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let character_filter_args = match args {
            Some(a) => pydict_to_value(a)?,
            None => json!({}),
        };

        slf.inner
            .append_character_filter(name, &character_filter_args);

        Ok(slf)
    }

    /// Appends a token filter to the tokenizer configuration.
    /// Keyword arguments are converted to a JSON value and used as the filter settings.
    #[pyo3(signature = (name, **args))]
    fn append_token_filter<'a>(
        mut slf: PyRefMut<'a, Self>,
        name: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<PyRefMut<'a, Self>> {
        let token_filter_args = match args {
            Some(a) => pydict_to_value(a)?,
            None => json!({}),
        };

        slf.inner.append_token_filter(name, &token_filter_args);

        Ok(slf)
    }

    /// Builds a `Tokenizer` from the accumulated configuration.
    #[pyo3(signature = ())]
    fn build(&self) -> PyResult<PyTokenizer> {
        self.inner
            .build()
            .map_err(|err| PyValueError::new_err(format!("Failed to build tokenizer: {err}")))
            .map(|t| PyTokenizer { inner: t })
    }
}
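
// A minimal usage sketch of this builder from the Python side, for orientation only.
// The module name (`lindera_py`), the mode string ("normal"), the dictionary kind
// ("ipadic"), and the filter name ("lowercase") are assumptions; the values actually
// accepted depend on how the extension module is packaged and which Lindera
// dictionary/filter features are compiled in.
//
//     from lindera_py import TokenizerBuilder
//
//     builder = TokenizerBuilder()
//     builder.set_mode("normal")
//     builder.set_dictionary_kind("ipadic")
//     builder.append_token_filter("lowercase")
//     tokenizer = builder.build()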

/// Python-facing tokenizer that wraps Lindera's `Tokenizer`.
#[pyclass(name = "Tokenizer")]
pub struct PyTokenizer {
    inner: Tokenizer,
}

#[pymethods]
impl PyTokenizer {
    #[new]
    #[pyo3(signature = (segmenter))]
    fn new(segmenter: PySegmenter) -> PyResult<Self> {
        Ok(Self {
            inner: Tokenizer::new(segmenter.inner),
        })
    }

    /// Creates a new tokenizer from a configuration dictionary.
    #[pyo3(signature = (config))]
    #[allow(clippy::wrong_self_convention)]
    fn from_config(&self, config: &Bound<'_, PyDict>) -> PyResult<Self> {
        let config_value = pydict_to_value(config)?;
        let tokenizer = Tokenizer::from_config(&config_value)
            .map_err(|err| PyValueError::new_err(format!("Failed to create tokenizer: {err}")))?;

        Ok(Self { inner: tokenizer })
    }

    #[pyo3(signature = (name, **args))]
    fn append_character_filter(
        &mut self,
        name: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<()> {
        let value = match args {
            Some(pydict) => pydict_to_value(pydict)?,
            None => json!({}),
        };

        let filter = CharacterFilterLoader::load_from_value(name, &value).map_err(|err| {
            PyValueError::new_err(format!("Failed to load character filter: {err}"))
        })?;
        self.inner.append_character_filter(filter);

        Ok(())
    }

    #[pyo3(signature = (name, **args))]
    fn append_token_filter(
        &mut self,
        name: &str,
        args: Option<&Bound<'_, PyDict>>,
    ) -> PyResult<()> {
        let value = match args {
            Some(pydict) => pydict_to_value(pydict)?,
            None => json!({}),
        };

        let filter = TokenFilterLoader::load_from_value(name, &value)
            .map_err(|err| PyValueError::new_err(format!("Failed to load token filter: {err}")))?;
        self.inner.append_token_filter(filter);

        Ok(())
    }

    /// Tokenizes the given text and returns a list of tokens.
    #[pyo3(signature = (text))]
    fn tokenize(&self, text: &str) -> PyResult<Vec<PyToken>> {
        let mut tokens = self
            .inner
            .tokenize(text)
            .map_err(|err| PyValueError::new_err(format!("Failed to tokenize text: {err}")))?;

        // `details()` needs mutable access to each token, hence `iter_mut` here.
        Ok(tokens
            .iter_mut()
            .map(|t| PyToken {
                #[allow(clippy::suspicious_to_owned)]
                text: t.text.to_owned().to_string(),
                byte_start: t.byte_start,
                byte_end: t.byte_end,
                position: t.position,
                position_length: t.position_length,
                details: t.details().iter().map(|d| d.to_string()).collect(),
            })
            .collect())
    }
}
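
// A hedged sketch of runtime filter configuration from Python, continuing from a
// `tokenizer` built as in the builder example above. The filter names
// ("unicode_normalize", "lowercase"), their argument keys, and the token attributes
// read in the loop are illustrative assumptions; availability depends on the Lindera
// filters compiled in and on how `Token` exposes its fields to Python.
//
//     tokenizer.append_character_filter("unicode_normalize", kind="nfkc")
//     tokenizer.append_token_filter("lowercase")
//
//     for token in tokenizer.tokenize("Ｌｉｎｄｅｒａで日本語を解析する。"):
//         print(token.text, token.byte_start, token.byte_end, token.details)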