// scirs2 — Python bindings source file: text.rs
1//! Python bindings for scirs2-text
2//!
3//! This module provides Python bindings for text processing operations,
4//! including tokenization, vectorization, sentiment analysis, stemming,
5//! string similarity metrics, and text cleaning.
6
7use pyo3::exceptions::PyRuntimeError;
8use pyo3::prelude::*;
9use pyo3::types::{PyDict, PyList};
10
11// NumPy types for Python array interface
12use scirs2_numpy::{IntoPyArray, PyArray1, PyArray2, PyArrayMethods};
13
14// Direct imports from scirs2-text
15use scirs2_text::{
16    // Cleansing functions
17    cleansing::{
18        expand_contractions, normalize_unicode, normalize_whitespace, remove_accents,
19        replace_emails, replace_urls, strip_html_tags,
20    },
21    // Sentiment
22    sentiment::{LexiconSentimentAnalyzer, Sentiment},
23    // Stemming
24    stemming::{LancasterStemmer, PorterStemmer, SnowballStemmer, Stemmer},
25    // Tokenization
26    tokenize::{
27        CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
28        WhitespaceTokenizer, WordTokenizer,
29    },
30    // Vectorization
31    vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer},
32};
33
34// ========================================
35// TOKENIZATION
36// ========================================
37
38/// Word tokenizer
39#[pyclass(name = "WordTokenizer")]
40pub struct PyWordTokenizer {
41    inner: WordTokenizer,
42}
43
44#[pymethods]
45impl PyWordTokenizer {
46    #[new]
47    #[pyo3(signature = (lowercase=true))]
48    fn new(lowercase: bool) -> Self {
49        Self {
50            inner: WordTokenizer::new(lowercase),
51        }
52    }
53
54    fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
55        self.inner
56            .tokenize(text)
57            .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
58    }
59
60    fn tokenize_batch(&self, texts: &Bound<'_, PyList>) -> PyResult<Vec<Vec<String>>> {
61        let texts_owned: Vec<String> = texts
62            .iter()
63            .map(|item| item.extract::<String>())
64            .collect::<PyResult<Vec<String>>>()?;
65        let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
66        self.inner
67            .tokenize_batch(&text_strs)
68            .map_err(|e| PyRuntimeError::new_err(format!("Batch tokenization failed: {}", e)))
69    }
70}
71
72/// Sentence tokenizer
73#[pyclass(name = "SentenceTokenizer")]
74pub struct PySentenceTokenizer {
75    inner: SentenceTokenizer,
76}
77
78#[pymethods]
79impl PySentenceTokenizer {
80    #[new]
81    fn new() -> Self {
82        Self {
83            inner: SentenceTokenizer::new(),
84        }
85    }
86
87    fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
88        self.inner
89            .tokenize(text)
90            .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
91    }
92}
93
94/// Character tokenizer
95#[pyclass(name = "CharacterTokenizer")]
96pub struct PyCharacterTokenizer {
97    inner: CharacterTokenizer,
98}
99
100#[pymethods]
101impl PyCharacterTokenizer {
102    #[new]
103    #[pyo3(signature = (use_grapheme_clusters=true))]
104    fn new(use_grapheme_clusters: bool) -> Self {
105        Self {
106            inner: CharacterTokenizer::new(use_grapheme_clusters),
107        }
108    }
109
110    fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
111        self.inner
112            .tokenize(text)
113            .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
114    }
115}
116
117/// N-gram tokenizer
118#[pyclass(name = "NgramTokenizer")]
119pub struct PyNgramTokenizer {
120    inner: NgramTokenizer,
121}
122
123#[pymethods]
124impl PyNgramTokenizer {
125    #[new]
126    #[pyo3(signature = (n=2))]
127    fn new(n: usize) -> PyResult<Self> {
128        let tokenizer = NgramTokenizer::new(n).map_err(|e| {
129            PyRuntimeError::new_err(format!("NgramTokenizer creation failed: {}", e))
130        })?;
131        Ok(Self { inner: tokenizer })
132    }
133
134    fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
135        self.inner
136            .tokenize(text)
137            .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
138    }
139}
140
141/// Whitespace tokenizer
142#[pyclass(name = "WhitespaceTokenizer")]
143pub struct PyWhitespaceTokenizer {
144    inner: WhitespaceTokenizer,
145}
146
147#[pymethods]
148impl PyWhitespaceTokenizer {
149    #[new]
150    fn new() -> Self {
151        Self {
152            inner: WhitespaceTokenizer::new(),
153        }
154    }
155
156    fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
157        self.inner
158            .tokenize(text)
159            .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
160    }
161}
162
163/// Regex tokenizer
164#[pyclass(name = "RegexTokenizer")]
165pub struct PyRegexTokenizer {
166    inner: RegexTokenizer,
167}
168
169#[pymethods]
170impl PyRegexTokenizer {
171    #[new]
172    #[pyo3(signature = (pattern, gaps=false))]
173    fn new(pattern: &str, gaps: bool) -> PyResult<Self> {
174        let tokenizer = RegexTokenizer::new(pattern, gaps).map_err(|e| {
175            PyRuntimeError::new_err(format!("RegexTokenizer creation failed: {}", e))
176        })?;
177        Ok(Self { inner: tokenizer })
178    }
179
180    fn tokenize(&self, text: &str) -> PyResult<Vec<String>> {
181        self.inner
182            .tokenize(text)
183            .map_err(|e| PyRuntimeError::new_err(format!("Tokenization failed: {}", e)))
184    }
185}
186
187// ========================================
188// VECTORIZATION
189// ========================================
190
191/// Count vectorizer (bag-of-words)
192#[pyclass(name = "CountVectorizer")]
193pub struct PyCountVectorizer {
194    inner: CountVectorizer,
195}
196
197#[pymethods]
198impl PyCountVectorizer {
199    #[new]
200    #[pyo3(signature = (binary=false))]
201    fn new(binary: bool) -> Self {
202        Self {
203            inner: CountVectorizer::new(binary),
204        }
205    }
206
207    fn fit(&mut self, texts: &Bound<'_, PyList>) -> PyResult<()> {
208        let texts_owned: Vec<String> = texts
209            .iter()
210            .map(|item| item.extract::<String>())
211            .collect::<PyResult<Vec<String>>>()?;
212        let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
213        self.inner
214            .fit(&text_strs)
215            .map_err(|e| PyRuntimeError::new_err(format!("Fit failed: {}", e)))
216    }
217
218    fn transform(&self, py: Python, text: &str) -> PyResult<Py<PyArray1<f64>>> {
219        let result = self
220            .inner
221            .transform(text)
222            .map_err(|e| PyRuntimeError::new_err(format!("Transform failed: {}", e)))?;
223        Ok(result.into_pyarray(py).unbind())
224    }
225
226    fn transform_batch(
227        &self,
228        py: Python,
229        texts: &Bound<'_, PyList>,
230    ) -> PyResult<Py<PyArray2<f64>>> {
231        let texts_owned: Vec<String> = texts
232            .iter()
233            .map(|item| item.extract::<String>())
234            .collect::<PyResult<Vec<String>>>()?;
235        let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
236        let result = self
237            .inner
238            .transform_batch(&text_strs)
239            .map_err(|e| PyRuntimeError::new_err(format!("Batch transform failed: {}", e)))?;
240        Ok(result.into_pyarray(py).unbind())
241    }
242
243    fn fit_transform(
244        &mut self,
245        py: Python,
246        texts: &Bound<'_, PyList>,
247    ) -> PyResult<Py<PyArray2<f64>>> {
248        let texts_owned: Vec<String> = texts
249            .iter()
250            .map(|item| item.extract::<String>())
251            .collect::<PyResult<Vec<String>>>()?;
252        let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
253        let result = self
254            .inner
255            .fit_transform(&text_strs)
256            .map_err(|e| PyRuntimeError::new_err(format!("Fit transform failed: {}", e)))?;
257        Ok(result.into_pyarray(py).unbind())
258    }
259
260    fn vocabulary_size(&self) -> usize {
261        self.inner.vocabulary_size()
262    }
263
264    fn get_feature_names(&self) -> Vec<String> {
265        let vocab = self.inner.vocabulary();
266        let mut features: Vec<(usize, String)> = vocab
267            .token_to_index()
268            .iter()
269            .map(|(token, &idx)| (idx, token.clone()))
270            .collect();
271        features.sort_by_key(|(idx, _)| *idx);
272        features.into_iter().map(|(_, token)| token).collect()
273    }
274}
275
276/// TF-IDF vectorizer
277#[pyclass(name = "TfidfVectorizer")]
278pub struct PyTfidfVectorizer {
279    inner: TfidfVectorizer,
280}
281
282#[pymethods]
283impl PyTfidfVectorizer {
284    #[new]
285    #[pyo3(signature = (lowercase=true, norm=true, norm_type=None))]
286    fn new(lowercase: bool, norm: bool, norm_type: Option<String>) -> Self {
287        Self {
288            inner: TfidfVectorizer::new(lowercase, norm, norm_type),
289        }
290    }
291
292    fn fit(&mut self, texts: &Bound<'_, PyList>) -> PyResult<()> {
293        let texts_owned: Vec<String> = texts
294            .iter()
295            .map(|item| item.extract::<String>())
296            .collect::<PyResult<Vec<String>>>()?;
297        let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
298        self.inner
299            .fit(&text_strs)
300            .map_err(|e| PyRuntimeError::new_err(format!("Fit failed: {}", e)))
301    }
302
303    fn transform(&self, py: Python, text: &str) -> PyResult<Py<PyArray1<f64>>> {
304        let result = self
305            .inner
306            .transform(text)
307            .map_err(|e| PyRuntimeError::new_err(format!("Transform failed: {}", e)))?;
308        Ok(result.into_pyarray(py).unbind())
309    }
310
311    fn transform_batch(
312        &self,
313        py: Python,
314        texts: &Bound<'_, PyList>,
315    ) -> PyResult<Py<PyArray2<f64>>> {
316        let texts_owned: Vec<String> = texts
317            .iter()
318            .map(|item| item.extract::<String>())
319            .collect::<PyResult<Vec<String>>>()?;
320        let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
321        let result = self
322            .inner
323            .transform_batch(&text_strs)
324            .map_err(|e| PyRuntimeError::new_err(format!("Batch transform failed: {}", e)))?;
325        Ok(result.into_pyarray(py).unbind())
326    }
327
328    fn fit_transform(
329        &mut self,
330        py: Python,
331        texts: &Bound<'_, PyList>,
332    ) -> PyResult<Py<PyArray2<f64>>> {
333        let texts_owned: Vec<String> = texts
334            .iter()
335            .map(|item| item.extract::<String>())
336            .collect::<PyResult<Vec<String>>>()?;
337        let text_strs: Vec<&str> = texts_owned.iter().map(|s| s.as_str()).collect();
338        let result = self
339            .inner
340            .fit_transform(&text_strs)
341            .map_err(|e| PyRuntimeError::new_err(format!("Fit transform failed: {}", e)))?;
342        Ok(result.into_pyarray(py).unbind())
343    }
344
345    fn vocabulary_size(&self) -> usize {
346        self.inner.vocabulary_size()
347    }
348
349    fn get_feature_names(&self) -> Vec<String> {
350        let vocab = self.inner.vocabulary();
351        let mut features: Vec<(usize, String)> = vocab
352            .token_to_index()
353            .iter()
354            .map(|(token, &idx)| (idx, token.clone()))
355            .collect();
356        features.sort_by_key(|(idx, _)| *idx);
357        features.into_iter().map(|(_, token)| token).collect()
358    }
359}
360
361// ========================================
362// SENTIMENT ANALYSIS
363// ========================================
364
365/// Convert Sentiment enum to string for Python
366fn sentiment_to_string(sentiment: &Sentiment) -> String {
367    match sentiment {
368        Sentiment::Positive => "positive".to_string(),
369        Sentiment::Negative => "negative".to_string(),
370        Sentiment::Neutral => "neutral".to_string(),
371    }
372}
373
374/// Lexicon-based sentiment analyzer
375#[pyclass(name = "LexiconSentimentAnalyzer")]
376pub struct PyLexiconSentimentAnalyzer {
377    inner: LexiconSentimentAnalyzer,
378}
379
380#[pymethods]
381impl PyLexiconSentimentAnalyzer {
382    #[new]
383    fn new() -> Self {
384        Self {
385            inner: LexiconSentimentAnalyzer::with_basiclexicon(),
386        }
387    }
388
389    fn analyze(&self, py: Python, text: &str) -> PyResult<Py<PyAny>> {
390        let result = self
391            .inner
392            .analyze(text)
393            .map_err(|e| PyRuntimeError::new_err(format!("Sentiment analysis failed: {}", e)))?;
394
395        // Convert to Python dict
396        let dict = PyDict::new(py);
397        dict.set_item("sentiment", sentiment_to_string(&result.sentiment))?;
398        dict.set_item("score", result.score)?;
399        dict.set_item("confidence", result.confidence)?;
400
401        let word_counts = PyDict::new(py);
402        word_counts.set_item("positive_words", result.word_counts.positive_words)?;
403        word_counts.set_item("negative_words", result.word_counts.negative_words)?;
404        word_counts.set_item("neutral_words", result.word_counts.neutral_words)?;
405        word_counts.set_item("total_words", result.word_counts.total_words)?;
406        dict.set_item("word_counts", word_counts)?;
407
408        Ok(dict.into())
409    }
410}
411
412// ========================================
413// STEMMING
414// ========================================
415
416/// Porter stemmer
417#[pyclass(name = "PorterStemmer")]
418pub struct PyPorterStemmer {
419    inner: PorterStemmer,
420}
421
422#[pymethods]
423impl PyPorterStemmer {
424    #[new]
425    fn new() -> Self {
426        Self {
427            inner: PorterStemmer::new(),
428        }
429    }
430
431    fn stem(&self, word: &str) -> PyResult<String> {
432        self.inner
433            .stem(word)
434            .map_err(|e| PyRuntimeError::new_err(format!("Stemming failed: {}", e)))
435    }
436
437    fn stem_batch(&self, words: &Bound<'_, PyList>) -> PyResult<Vec<String>> {
438        let words_owned: Vec<String> = words
439            .iter()
440            .map(|item| item.extract::<String>())
441            .collect::<PyResult<Vec<String>>>()?;
442        let word_strs: Vec<&str> = words_owned.iter().map(|s| s.as_str()).collect();
443        self.inner
444            .stem_batch(&word_strs)
445            .map_err(|e| PyRuntimeError::new_err(format!("Batch stemming failed: {}", e)))
446    }
447}
448
449/// Snowball stemmer
450#[pyclass(name = "SnowballStemmer")]
451pub struct PySnowballStemmer {
452    inner: SnowballStemmer,
453}
454
455#[pymethods]
456impl PySnowballStemmer {
457    #[new]
458    #[pyo3(signature = (language="english"))]
459    fn new(language: &str) -> PyResult<Self> {
460        let stemmer = SnowballStemmer::new(language).map_err(|e| {
461            PyRuntimeError::new_err(format!("SnowballStemmer creation failed: {}", e))
462        })?;
463        Ok(Self { inner: stemmer })
464    }
465
466    fn stem(&self, word: &str) -> PyResult<String> {
467        self.inner
468            .stem(word)
469            .map_err(|e| PyRuntimeError::new_err(format!("Stemming failed: {}", e)))
470    }
471
472    fn stem_batch(&self, words: &Bound<'_, PyList>) -> PyResult<Vec<String>> {
473        let words_owned: Vec<String> = words
474            .iter()
475            .map(|item| item.extract::<String>())
476            .collect::<PyResult<Vec<String>>>()?;
477        let word_strs: Vec<&str> = words_owned.iter().map(|s| s.as_str()).collect();
478        self.inner
479            .stem_batch(&word_strs)
480            .map_err(|e| PyRuntimeError::new_err(format!("Batch stemming failed: {}", e)))
481    }
482}
483
484/// Lancaster stemmer
485#[pyclass(name = "LancasterStemmer")]
486pub struct PyLancasterStemmer {
487    inner: LancasterStemmer,
488}
489
490#[pymethods]
491impl PyLancasterStemmer {
492    #[new]
493    fn new() -> Self {
494        Self {
495            inner: LancasterStemmer::new(),
496        }
497    }
498
499    fn stem(&self, word: &str) -> PyResult<String> {
500        self.inner
501            .stem(word)
502            .map_err(|e| PyRuntimeError::new_err(format!("Stemming failed: {}", e)))
503    }
504
505    fn stem_batch(&self, words: &Bound<'_, PyList>) -> PyResult<Vec<String>> {
506        let words_owned: Vec<String> = words
507            .iter()
508            .map(|item| item.extract::<String>())
509            .collect::<PyResult<Vec<String>>>()?;
510        let word_strs: Vec<&str> = words_owned.iter().map(|s| s.as_str()).collect();
511        self.inner
512            .stem_batch(&word_strs)
513            .map_err(|e| PyRuntimeError::new_err(format!("Batch stemming failed: {}", e)))
514    }
515}
516
517// ========================================
518// STRING SIMILARITY METRICS
519// ========================================
520
521/// Levenshtein distance
522#[pyfunction]
523fn levenshtein_distance_py(s1: &str, s2: &str) -> usize {
524    scirs2_text::distance::levenshtein_distance(s1, s2)
525}
526
527/// Cosine similarity between two vectors
528#[pyfunction]
529fn cosine_similarity_py(
530    vec1: &Bound<'_, PyArray1<f64>>,
531    vec2: &Bound<'_, PyArray1<f64>>,
532) -> PyResult<f64> {
533    let v1_binding = vec1.readonly();
534    let v2_binding = vec2.readonly();
535    let v1_view = v1_binding.as_array();
536    let v2_view = v2_binding.as_array();
537
538    scirs2_text::distance::cosine_similarity(v1_view, v2_view)
539        .map_err(|e| PyRuntimeError::new_err(format!("Similarity calculation failed: {}", e)))
540}
541
542/// Jaccard similarity between two token sets
543#[pyfunction]
544fn jaccard_similarity_py(s1: &str, s2: &str) -> PyResult<f64> {
545    scirs2_text::distance::jaccard_similarity(s1, s2, None)
546        .map_err(|e| PyRuntimeError::new_err(format!("Similarity calculation failed: {}", e)))
547}
548
549// ========================================
550// TEXT CLEANING
551// ========================================
552
553/// Strip HTML tags
554#[pyfunction]
555fn strip_html_tags_py(text: &str) -> String {
556    strip_html_tags(text)
557}
558
559/// Replace URLs with a replacement string
560#[pyfunction]
561#[pyo3(signature = (text, replacement="<URL>"))]
562fn replace_urls_py(text: &str, replacement: &str) -> String {
563    replace_urls(text, replacement)
564}
565
566/// Replace emails with a replacement string
567#[pyfunction]
568#[pyo3(signature = (text, replacement="<EMAIL>"))]
569fn replace_emails_py(text: &str, replacement: &str) -> String {
570    replace_emails(text, replacement)
571}
572
573/// Expand contractions
574#[pyfunction]
575fn expand_contractions_py(text: &str) -> String {
576    expand_contractions(text)
577}
578
579/// Normalize Unicode
580#[pyfunction]
581fn normalize_unicode_py(text: &str) -> PyResult<String> {
582    normalize_unicode(text)
583        .map_err(|e| PyRuntimeError::new_err(format!("Unicode normalization failed: {}", e)))
584}
585
586/// Normalize whitespace
587#[pyfunction]
588fn normalize_whitespace_py(text: &str) -> String {
589    normalize_whitespace(text)
590}
591
592/// Remove accents
593#[pyfunction]
594fn remove_accents_py(text: &str) -> String {
595    remove_accents(text)
596}
597
/// Register every text-processing class and function on the given Python
/// module. Intended to be called from the crate's top-level PyO3 module
/// initializer; propagates any registration error via `?`.
pub fn register_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
    // Tokenization classes (Python names come from #[pyclass(name = ...)]).
    m.add_class::<PyWordTokenizer>()?;
    m.add_class::<PySentenceTokenizer>()?;
    m.add_class::<PyCharacterTokenizer>()?;
    m.add_class::<PyNgramTokenizer>()?;
    m.add_class::<PyWhitespaceTokenizer>()?;
    m.add_class::<PyRegexTokenizer>()?;

    // Vectorization
    m.add_class::<PyCountVectorizer>()?;
    m.add_class::<PyTfidfVectorizer>()?;

    // Sentiment analysis
    m.add_class::<PyLexiconSentimentAnalyzer>()?;

    // Stemming
    m.add_class::<PyPorterStemmer>()?;
    m.add_class::<PySnowballStemmer>()?;
    m.add_class::<PyLancasterStemmer>()?;

    // String similarity metrics (exposed under their `_py`-suffixed names).
    m.add_function(wrap_pyfunction!(levenshtein_distance_py, m)?)?;
    m.add_function(wrap_pyfunction!(cosine_similarity_py, m)?)?;
    m.add_function(wrap_pyfunction!(jaccard_similarity_py, m)?)?;

    // Text cleaning
    m.add_function(wrap_pyfunction!(strip_html_tags_py, m)?)?;
    m.add_function(wrap_pyfunction!(replace_urls_py, m)?)?;
    m.add_function(wrap_pyfunction!(replace_emails_py, m)?)?;
    m.add_function(wrap_pyfunction!(expand_contractions_py, m)?)?;
    m.add_function(wrap_pyfunction!(normalize_unicode_py, m)?)?;
    m.add_function(wrap_pyfunction!(normalize_whitespace_py, m)?)?;
    m.add_function(wrap_pyfunction!(remove_accents_py, m)?)?;

    Ok(())
}