1use pyo3::exceptions::PyRuntimeError;
8use pyo3::prelude::*;
9use pyo3::types::{PyDict, PyList};
10
11use scirs2_numpy::{IntoPyArray, PyArray1, PyArray2, PyArrayMethods};
13
14use scirs2_text::{
16 cleansing::{
18 expand_contractions, normalize_unicode, normalize_whitespace, remove_accents,
19 replace_emails, replace_urls, strip_html_tags,
20 },
21 sentiment::{LexiconSentimentAnalyzer, Sentiment},
23 stemming::{LancasterStemmer, PorterStemmer, SnowballStemmer, Stemmer},
25 tokenize::{
27 CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
28 WhitespaceTokenizer, WordTokenizer,
29 },
30 vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer},
32};
33
34#[pyclass(name = "WordTokenizer")]
40pub struct PyWordTokenizer {
41 inner: WordTokenizer,
42}
43
44#[pymethods]
45impl PyWordTokenizer {
46 #[new]
47 #[pyo3(signature = (lowercase=true))]
48 fn new(lowercase: bool) -> Self {
49 Self {
50 inner: WordTokenizer::new(lowercase),
51 }
52 }
53
54 fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
55 self.inner
56 .tokenize(text)
57 .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
58 }
59
60 fn tokenize_batch(&self, texts: &Bound<'_, PyList>) -> PyResult<Vec<Vec<String>>> {
61 let texts_owned: Vec<String> = texts
62 .iter()
63 .map(|item| item.extract::<String>())
64 .collect::<PyResult<Vec<String>>>()?;
65 let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
66 self.inner
67 .tokenize_batch(&text_strs)
68 .map_err(|e| PyRuntimeError::new_err(format!("Batch tokenization failed: {}", e)))
69 }
70}
71
72#[pyclass(name = "SentenceTokenizer")]
74pub struct PySentenceTokenizer {
75 inner: SentenceTokenizer,
76}
77
78#[pymethods]
79impl PySentenceTokenizer {
80 #[new]
81 fn new() -> Self {
82 Self {
83 inner: SentenceTokenizer::new(),
84 }
85 }
86
87 fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
88 self.inner
89 .tokenize(text)
90 .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
91 }
92}
93
94#[pyclass(name = "CharacterTokenizer")]
96pub struct PyCharacterTokenizer {
97 inner: CharacterTokenizer,
98}
99
100#[pymethods]
101impl PyCharacterTokenizer {
102 #[new]
103 #[pyo3(signature = (use_grapheme_clusters=true))]
104 fn new(use_grapheme_clusters: bool) -> Self {
105 Self {
106 inner: CharacterTokenizer::new(use_grapheme_clusters),
107 }
108 }
109
110 fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
111 self.inner
112 .tokenize(text)
113 .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
114 }
115}
116
117#[pyclass(name = "NgramTokenizer")]
119pub struct PyNgramTokenizer {
120 inner: NgramTokenizer,
121}
122
123#[pymethods]
124impl PyNgramTokenizer {
125 #[new]
126 #[pyo3(signature = (n=2))]
127 fn new(n: usize) -> PyResult<Self> {
128 let tokenizer = NgramTokenizer::new(n).map_err(|e| {
129 PyRuntimeError::new_err(format!("NgramTokenizer creation failed: {}", e))
130 })?;
131 Ok(Self { inner: tokenizer })
132 }
133
134 fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
135 self.inner
136 .tokenize(text)
137 .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
138 }
139}
140
141#[pyclass(name = "WhitespaceTokenizer")]
143pub struct PyWhitespaceTokenizer {
144 inner: WhitespaceTokenizer,
145}
146
147#[pymethods]
148impl PyWhitespaceTokenizer {
149 #[new]
150 fn new() -> Self {
151 Self {
152 inner: WhitespaceTokenizer::new(),
153 }
154 }
155
156 fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
157 self.inner
158 .tokenize(text)
159 .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
160 }
161}
162
163#[pyclass(name = "RegexTokenizer")]
165pub struct PyRegexTokenizer {
166 inner: RegexTokenizer,
167}
168
169#[pymethods]
170impl PyRegexTokenizer {
171 #[new]
172 #[pyo3(signature = (pattern, gaps=false))]
173 fn new(pattern: &str, gaps: bool) -> PyResult<Self> {
174 let tokenizer = RegexTokenizer::new(pattern, gaps).map_err(|e| {
175 PyRuntimeError::new_err(format!("RegexTokenizer creation failed: {}", e))
176 })?;
177 Ok(Self { inner: tokenizer })
178 }
179
180 fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
181 self.inner
182 .tokenize(text)
183 .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
184 }
185}
186
187#[pyclass(name = "CountVectorizer")]
193pub struct PyCountVectorizer {
194 inner: CountVectorizer,
195}
196
197#[pymethods]
198impl PyCountVectorizer {
199 #[new]
200 #[pyo3(signature = (binary=false))]
201 fn new(binary: bool) -> Self {
202 Self {
203 inner: CountVectorizer::new(binary),
204 }
205 }
206
207 fn fit(&mut self, texts: &Bound<'_, PyList>) -> PyResult<()> {
208 let texts_owned: Vec<String> = texts
209 .iter()
210 .map(|item| item.extract::<String>())
211 .collect::<PyResult<Vec<String>>>()?;
212 let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
213 self.inner
214 .fit(&text_strs)
215 .map_err(|e| PyRuntimeError::new_err(format!("Fit failed: {}", e)))
216 }
217
218 fn transform(&self, py: Python, text: &str) -> PyResult<Py<PyArray1<f64>>> {
219 let result = self
220 .inner
221 .transform(text)
222 .map_err(|e| PyRuntimeError::new_err(format!("Transform failed: {}", e)))?;
223 Ok(result.into_pyarray(py).unbind())
224 }
225
226 fn transform_batch(
227 &self,
228 py: Python,
229 texts: &Bound<'_, PyList>,
230 ) -> PyResult<Py<PyArray2<f64>>> {
231 let texts_owned: Vec<String> = texts
232 .iter()
233 .map(|item| item.extract::<String>())
234 .collect::<PyResult<Vec<String>>>()?;
235 let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
236 let result = self
237 .inner
238 .transform_batch(&text_strs)
239 .map_err(|e| PyRuntimeError::new_err(format!("Batch transform failed: {}", e)))?;
240 Ok(result.into_pyarray(py).unbind())
241 }
242
243 fn fit_transform(
244 &mut self,
245 py: Python,
246 texts: &Bound<'_, PyList>,
247 ) -> PyResult<Py<PyArray2<f64>>> {
248 let texts_owned: Vec<String> = texts
249 .iter()
250 .map(|item| item.extract::<String>())
251 .collect::<PyResult<Vec<String>>>()?;
252 let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
253 let result = self
254 .inner
255 .fit_transform(&text_strs)
256 .map_err(|e| PyRuntimeError::new_err(format!("Fit transform failed: {}", e)))?;
257 Ok(result.into_pyarray(py).unbind())
258 }
259
260 fn vocabulary_size(&self) -> usize {
261 self.inner.vocabulary_size()
262 }
263
264 fn get_feature_names(&self) -> Vec<String> {
265 let vocab = self.inner.vocabulary();
266 let mut features: Vec<(usize, String)> = vocab
267 .token_to_index()
268 .iter()
269 .map(|(token, &idx)| (idx, token.clone()))
270 .collect();
271 features.sort_by_key(|(idx, _)| *idx);
272 features.into_iter().map(|(_, token)| token).collect()
273 }
274}
275
276#[pyclass(name = "TfidfVectorizer")]
278pub struct PyTfidfVectorizer {
279 inner: TfidfVectorizer,
280}
281
282#[pymethods]
283impl PyTfidfVectorizer {
284 #[new]
285 #[pyo3(signature = (lowercase=true, norm=true, norm_type=None))]
286 fn new(lowercase: bool, norm: bool, norm_type: Option<String>) -> Self {
287 Self {
288 inner: TfidfVectorizer::new(lowercase, norm, norm_type),
289 }
290 }
291
292 fn fit(&mut self, texts: &Bound<'_, PyList>) -> PyResult<()> {
293 let texts_owned: Vec<String> = texts
294 .iter()
295 .map(|item| item.extract::<String>())
296 .collect::<PyResult<Vec<String>>>()?;
297 let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
298 self.inner
299 .fit(&text_strs)
300 .map_err(|e| PyRuntimeError::new_err(format!("Fit failed: {}", e)))
301 }
302
303 fn transform(&self, py: Python, text: &str) -> PyResult<Py<PyArray1<f64>>> {
304 let result = self
305 .inner
306 .transform(text)
307 .map_err(|e| PyRuntimeError::new_err(format!("Transform failed: {}", e)))?;
308 Ok(result.into_pyarray(py).unbind())
309 }
310
311 fn transform_batch(
312 &self,
313 py: Python,
314 texts: &Bound<'_, PyList>,
315 ) -> PyResult<Py<PyArray2<f64>>> {
316 let texts_owned: Vec<String> = texts
317 .iter()
318 .map(|item| item.extract::<String>())
319 .collect::<PyResult<Vec<String>>>()?;
320 let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
321 let result = self
322 .inner
323 .transform_batch(&text_strs)
324 .map_err(|e| PyRuntimeError::new_err(format!("Batch transform failed: {}", e)))?;
325 Ok(result.into_pyarray(py).unbind())
326 }
327
328 fn fit_transform(
329 &mut self,
330 py: Python,
331 texts: &Bound<'_, PyList>,
332 ) -> PyResult<Py<PyArray2<f64>>> {
333 let texts_owned: Vec<String> = texts
334 .iter()
335 .map(|item| item.extract::<String>())
336 .collect::<PyResult<Vec<String>>>()?;
337 let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
338 let result = self
339 .inner
340 .fit_transform(&text_strs)
341 .map_err(|e| PyRuntimeError::new_err(format!("Fit transform failed: {}", e)))?;
342 Ok(result.into_pyarray(py).unbind())
343 }
344
345 fn vocabulary_size(&self) -> usize {
346 self.inner.vocabulary_size()
347 }
348
349 fn get_feature_names(&self) -> Vec<String> {
350 let vocab = self.inner.vocabulary();
351 let mut features: Vec<(usize, String)> = vocab
352 .token_to_index()
353 .iter()
354 .map(|(token, &idx)| (idx, token.clone()))
355 .collect();
356 features.sort_by_key(|(idx, _)| *idx);
357 features.into_iter().map(|(_, token)| token).collect()
358 }
359}
360
361fn sentiment_to_string(sentiment: &Sentiment) -> String {
367 match sentiment {
368 Sentiment::Positive => "positive".to_string(),
369 Sentiment::Negative => "negative".to_string(),
370 Sentiment::Neutral => "neutral".to_string(),
371 }
372}
373
374#[pyclass(name = "LexiconSentimentAnalyzer")]
376pub struct PyLexiconSentimentAnalyzer {
377 inner: LexiconSentimentAnalyzer,
378}
379
380#[pymethods]
381impl PyLexiconSentimentAnalyzer {
382 #[new]
383 fn new() -> Self {
384 Self {
385 inner: LexiconSentimentAnalyzer::with_basiclexicon(),
386 }
387 }
388
389 fn analyze(&self, py: Python, text: &str) -> PyResult<Py<PyAny>> {
390 let result = self
391 .inner
392 .analyze(text)
393 .map_err(|e| PyRuntimeError::new_err(format!("Sentiment analysis failed: {}", e)))?;
394
395 let dict = PyDict::new(py);
397 dict.set_item("sentiment", sentiment_to_string(&result.sentiment))?;
398 dict.set_item("score", result.score)?;
399 dict.set_item("confidence", result.confidence)?;
400
401 let word_counts = PyDict::new(py);
402 word_counts.set_item("positive_words", result.word_counts.positive_words)?;
403 word_counts.set_item("negative_words", result.word_counts.negative_words)?;
404 word_counts.set_item("neutral_words", result.word_counts.neutral_words)?;
405 word_counts.set_item("total_words", result.word_counts.total_words)?;
406 dict.set_item("word_counts", word_counts)?;
407
408 Ok(dict.into())
409 }
410}
411
412#[pyclass(name = "PorterStemmer")]
418pub struct PyPorterStemmer {
419 inner: PorterStemmer,
420}
421
422#[pymethods]
423impl PyPorterStemmer {
424 #[new]
425 fn new() -> Self {
426 Self {
427 inner: PorterStemmer::new(),
428 }
429 }
430
431 fn stem(&self, word: &str) -> PyResult<String> {
432 self.inner
433 .stem(word)
434 .map_err(|e| PyRuntimeError::new_err(format!("Stemming failed: {}", e)))
435 }
436
437 fn stem_batch(&self, words: &Bound<'_, PyList>) -> PyResult<Vec<String>> {
438 let words_owned: Vec<String> = words
439 .iter()
440 .map(|item| item.extract::<String>())
441 .collect::<PyResult<Vec<String>>>()?;
442 let word_strs: Vec<&str> = words_owned.iter().map(|s| s.as_str()).collect();
443 self.inner
444 .stem_batch(&word_strs)
445 .map_err(|e| PyRuntimeError::new_err(format!("Batch stemming failed: {}", e)))
446 }
447}
448
449#[pyclass(name = "SnowballStemmer")]
451pub struct PySnowballStemmer {
452 inner: SnowballStemmer,
453}
454
455#[pymethods]
456impl PySnowballStemmer {
457 #[new]
458 #[pyo3(signature = (language="english"))]
459 fn new(language: &str) -> PyResult<Self> {
460 let stemmer = SnowballStemmer::new(language).map_err(|e| {
461 PyRuntimeError::new_err(format!("SnowballStemmer creation failed: {}", e))
462 })?;
463 Ok(Self { inner: stemmer })
464 }
465
466 fn stem(&self, word: &str) -> PyResult<String> {
467 self.inner
468 .stem(word)
469 .map_err(|e| PyRuntimeError::new_err(format!("Stemming failed: {}", e)))
470 }
471
472 fn stem_batch(&self, words: &Bound<'_, PyList>) -> PyResult<Vec<String>> {
473 let words_owned: Vec<String> = words
474 .iter()
475 .map(|item| item.extract::<String>())
476 .collect::<PyResult<Vec<String>>>()?;
477 let word_strs: Vec<&str> = words_owned.iter().map(|s| s.as_str()).collect();
478 self.inner
479 .stem_batch(&word_strs)
480 .map_err(|e| PyRuntimeError::new_err(format!("Batch stemming failed: {}", e)))
481 }
482}
483
484#[pyclass(name = "LancasterStemmer")]
486pub struct PyLancasterStemmer {
487 inner: LancasterStemmer,
488}
489
490#[pymethods]
491impl PyLancasterStemmer {
492 #[new]
493 fn new() -> Self {
494 Self {
495 inner: LancasterStemmer::new(),
496 }
497 }
498
499 fn stem(&self, word: &str) -> PyResult<String> {
500 self.inner
501 .stem(word)
502 .map_err(|e| PyRuntimeError::new_err(format!("Stemming failed: {}", e)))
503 }
504
505 fn stem_batch(&self, words: &Bound<'_, PyList>) -> PyResult<Vec<String>> {
506 let words_owned: Vec<String> = words
507 .iter()
508 .map(|item| item.extract::<String>())
509 .collect::<PyResult<Vec<String>>>()?;
510 let word_strs: Vec<&str> = words_owned.iter().map(|s| s.as_str()).collect();
511 self.inner
512 .stem_batch(&word_strs)
513 .map_err(|e| PyRuntimeError::new_err(format!("Batch stemming failed: {}", e)))
514 }
515}
516
/// Compute the Levenshtein edit distance between `s1` and `s2`,
/// delegating to `scirs2_text::distance::levenshtein_distance`.
#[pyfunction]
fn levenshtein_distance_py(s1: &str, s2: &str) -> usize {
    scirs2_text::distance::levenshtein_distance(s1, s2)
}
526
527#[pyfunction]
529fn cosine_similarity_py(
530 vec1: &Bound<'_, PyArray1<f64>>,
531 vec2: &Bound<'_, PyArray1<f64>>,
532) -> PyResult<f64> {
533 let v1_binding = vec1.readonly();
534 let v2_binding = vec2.readonly();
535 let v1_view = v1_binding.as_array();
536 let v2_view = v2_binding.as_array();
537
538 scirs2_text::distance::cosine_similarity(v1_view, v2_view)
539 .map_err(|e| PyRuntimeError::new_err(format!("Similarity calculation failed: {}", e)))
540}
541
542#[pyfunction]
544fn jaccard_similarity_py(s1: &str, s2: &str) -> PyResult<f64> {
545 scirs2_text::distance::jaccard_similarity(s1, s2, None)
546 .map_err(|e| PyRuntimeError::new_err(format!("Similarity calculation failed: {}", e)))
547}
548
/// Remove HTML tags from `text`, delegating to
/// `scirs2_text::cleansing::strip_html_tags`.
#[pyfunction]
fn strip_html_tags_py(text: &str) -> String {
    strip_html_tags(text)
}
558
/// Replace URLs found in `text` with `replacement` (default `"<URL>"`),
/// delegating to `scirs2_text::cleansing::replace_urls`.
#[pyfunction]
#[pyo3(signature = (text, replacement="<URL>"))]
fn replace_urls_py(text: &str, replacement: &str) -> String {
    replace_urls(text, replacement)
}
565
/// Replace email addresses found in `text` with `replacement`
/// (default `"<EMAIL>"`), delegating to
/// `scirs2_text::cleansing::replace_emails`.
#[pyfunction]
#[pyo3(signature = (text, replacement="<EMAIL>"))]
fn replace_emails_py(text: &str, replacement: &str) -> String {
    replace_emails(text, replacement)
}
572
/// Expand contractions in `text`, delegating to
/// `scirs2_text::cleansing::expand_contractions` (the set of handled
/// contractions is defined by that library).
#[pyfunction]
fn expand_contractions_py(text: &str) -> String {
    expand_contractions(text)
}
578
579#[pyfunction]
581fn normalize_unicode_py(text: &str) -> PyResult<String> {
582 normalize_unicode(text)
583 .map_err(|e| PyRuntimeError::new_err(format!("Unicode normalization failed: {}", e)))
584}
585
/// Normalize whitespace in `text`, delegating to
/// `scirs2_text::cleansing::normalize_whitespace` (exact collapsing rules
/// are defined by that library).
#[pyfunction]
fn normalize_whitespace_py(text: &str) -> String {
    normalize_whitespace(text)
}
591
/// Remove accents/diacritics from `text`, delegating to
/// `scirs2_text::cleansing::remove_accents`.
#[pyfunction]
fn remove_accents_py(text: &str) -> String {
    remove_accents(text)
}
597
/// Register every text-processing class and function of this module on the
/// Python module `m`. Called once during extension-module initialization.
pub fn register_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
    // Tokenizers.
    m.add_class::<PyWordTokenizer>()?;
    m.add_class::<PySentenceTokenizer>()?;
    m.add_class::<PyCharacterTokenizer>()?;
    m.add_class::<PyNgramTokenizer>()?;
    m.add_class::<PyWhitespaceTokenizer>()?;
    m.add_class::<PyRegexTokenizer>()?;

    // Vectorizers.
    m.add_class::<PyCountVectorizer>()?;
    m.add_class::<PyTfidfVectorizer>()?;

    // Sentiment analysis.
    m.add_class::<PyLexiconSentimentAnalyzer>()?;

    // Stemmers.
    m.add_class::<PyPorterStemmer>()?;
    m.add_class::<PySnowballStemmer>()?;
    m.add_class::<PyLancasterStemmer>()?;

    // Distance / similarity functions.
    m.add_function(wrap_pyfunction!(levenshtein_distance_py, m)?)?;
    m.add_function(wrap_pyfunction!(cosine_similarity_py, m)?)?;
    m.add_function(wrap_pyfunction!(jaccard_similarity_py, m)?)?;

    // Text-cleansing functions.
    m.add_function(wrap_pyfunction!(strip_html_tags_py, m)?)?;
    m.add_function(wrap_pyfunction!(replace_urls_py, m)?)?;
    m.add_function(wrap_pyfunction!(replace_emails_py, m)?)?;
    m.add_function(wrap_pyfunction!(expand_contractions_py, m)?)?;
    m.add_function(wrap_pyfunction!(normalize_unicode_py, m)?)?;
    m.add_function(wrap_pyfunction!(normalize_whitespace_py, m)?)?;
    m.add_function(wrap_pyfunction!(remove_accents_py, m)?)?;

    Ok(())
}