Struct BasicTextCleaner

Source

pub struct BasicTextCleaner { /* private fields */ }

Expand description

Text cleaner for removing special characters, extra whitespace, and stopwords

Implementations§

Source §

impl BasicTextCleaner

Source

pub fn new( remove_special_chars: bool, remove_stopwords: bool, normalize_whitespace: bool, ) -> Self

Create a new text cleaner

Examples found in repository

examples/text_processing_demo.rs (line 35)

14fn main() -> Result<(), Box<dyn std::error::Error>> {
15    println!("=== SciRS2 Text Processing Demo ===\n");
16
17    let documents = [
18        "The quick brown fox jumps over the lazy dog.",
19        "A fast red fox leaped over the sleeping canine.",
20        "Machine learning algorithms process textual data efficiently.",
21        "Text processing and natural language understanding are important.",
22    ];
23
24    // 1. Text Normalization
25    println!("1. Text Normalization");
26    let normalizer = BasicNormalizer::new(true, true);
27    for (i, doc) in documents.iter().enumerate() {
28        let normalized = normalizer.normalize(doc)?;
29        println!("Doc {}: {}", i + 1, normalized);
30    }
31    println!();
32
33    // 2. Text Cleaning
34    println!("2. Text Cleaning");
35    let cleaner = BasicTextCleaner::new(true, true, true);
36    for (i, doc) in documents.iter().enumerate() {
37        let cleaned = cleaner.clean(doc)?;
38        println!("Doc {}: {}", i + 1, cleaned);
39    }
40    println!();
41
42    // 3. Tokenization Examples
43    println!("3. Tokenization Examples");
44
45    // Word tokenization
46    let word_tokenizer = WordTokenizer::new(true);
47    let tokens = word_tokenizer.tokenize(documents[0])?;
48    println!("Word tokens: {tokens:?}");
49
50    // N-gram tokenization
51    let ngram_tokenizer = NgramTokenizer::new(2)?;
52    let ngrams = ngram_tokenizer.tokenize(documents[0])?;
53    println!("2-grams: {ngrams:?}");
54
55    // Regex tokenization
56    let regex_tokenizer = RegexTokenizer::new(r"\b\w+\b", false)?;
57    let regex_tokens = regex_tokenizer.tokenize(documents[0])?;
58    println!("Regex tokens: {regex_tokens:?}");
59    println!();
60
61    // 4. Stemming and Lemmatization
62    println!("4. Stemming and Lemmatization");
63    let porter_stemmer = PorterStemmer::new();
64    let lemmatizer = SimpleLemmatizer::new();
65
66    let test_words = vec!["running", "jumped", "better", "processing"];
67    for word in test_words {
68        let stemmed = porter_stemmer.stem(word)?;
69        let lemmatized = lemmatizer.stem(word)?;
70        println!("{word}: stemmed={stemmed}, lemmatized={lemmatized}");
71    }
72    println!();
73
74    // 5. Count Vectorization
75    println!("5. Count Vectorization");
76    let mut count_vectorizer = CountVectorizer::new(false);
77
78    let doc_refs = documents.to_vec();
79    count_vectorizer.fit(&doc_refs)?;
80
81    // Transform individual documents
82    let count_matrix = count_vectorizer.transform_batch(&doc_refs)?;
83    println!("Count vector shape: {:?}", count_matrix.shape());
84    println!("Vocabulary size: {}", count_vectorizer.vocabulary().len());
85
86    println!();
87
88    // 6. TF-IDF Vectorization
89    println!("6. TF-IDF Vectorization");
90    let mut tfidf_vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
91
92    tfidf_vectorizer.fit(&doc_refs)?;
93    let tfidf_matrix = tfidf_vectorizer.transform_batch(&doc_refs)?;
94
95    println!("TF-IDF vector shape: {:?}", tfidf_matrix.shape());
96    println!("Sample TF-IDF values:");
97    for i in 0..3.min(tfidf_matrix.nrows()) {
98        for j in 0..5.min(tfidf_matrix.ncols()) {
99            print!("{:.3} ", tfidf_matrix[[i, j]]);
100        }
101        println!();
102    }
103    println!();
104
105    // 7. Complete Pipeline Example
106    println!("7. Complete Text Processing Pipeline");
107    let testtext = "The cats were running quickly through the gardens.";
108
109    // Normalize
110    let normalized = normalizer.normalize(testtext)?;
111    println!("Normalized: {normalized}");
112
113    // Clean
114    let cleaned = cleaner.clean(&normalized)?;
115    println!("Cleaned: {cleaned}");
116
117    // Tokenize
118    let tokens = word_tokenizer.tokenize(&cleaned)?;
119    println!("Tokens: {tokens:?}");
120
121    // Stem
122    let stemmed_tokens: Result<Vec<_>, _> = tokens
123        .iter()
124        .map(|token| porter_stemmer.stem(token))
125        .collect();
126    let stemmed_tokens = stemmed_tokens?;
127    println!("Stemmed: {stemmed_tokens:?}");
128
129    Ok(())
130}

More examples

Hide additional examples

examples/enhanced_vectorization_demo.rs (line 117)

13fn main() -> Result<(), Box<dyn std::error::Error>> {
14    println!("=== Enhanced Text Vectorization Demo ===\n");
15
16    let documents = vec![
17        "The quick brown fox jumps over the lazy dog.",
18        "A fast red fox leaped over the sleeping canine.",
19        "Machine learning algorithms process textual data efficiently.",
20        "Text processing and natural language understanding are important.",
21        "Natural language processing is a field of artificial intelligence.",
22        "Deep learning models can understand complex text patterns.",
23    ];
24
25    // 1. Enhanced Count Vectorizer with Unigrams
26    println!("1. Enhanced Count Vectorizer (Unigrams only)");
27    let mut count_vec_unigram = EnhancedCountVectorizer::new()
28        .set_binary(false)
29        .set_max_features(Some(20));
30
31    count_vec_unigram.fit(&documents)?;
32    let count_matrix = count_vec_unigram.transform_batch(&documents)?;
33
34    println!("Vocabulary size: {}", count_vec_unigram.vocabulary().len());
35    println!("Count matrix shape: {:?}", count_matrix.shape());
36    println!();
37
38    // 2. Enhanced Count Vectorizer with N-grams
39    println!("2. Enhanced Count Vectorizer (Unigrams + Bigrams)");
40    let mut count_vec_ngram = EnhancedCountVectorizer::new()
41        .set_ngram_range((1, 2))?
42        .set_max_features(Some(30));
43
44    count_vec_ngram.fit(&documents)?;
45    let ngram_matrix = count_vec_ngram.transform_batch(&documents)?;
46
47    println!(
48        "Vocabulary size with n-grams: {}",
49        count_vec_ngram.vocabulary().len()
50    );
51    println!("N-gram count matrix shape: {:?}", ngram_matrix.shape());
52
53    // Show some n-gram tokens
54    let vocab = count_vec_ngram.vocabulary();
55    let mut ngram_tokens: Vec<String> = Vec::new();
56    for i in 0..vocab.len().min(10) {
57        if let Some(token) = vocab.get_token(i) {
58            if token.contains(' ') {
59                // This is a bigram
60                ngram_tokens.push(token.to_string());
61            }
62        }
63    }
64    println!("Sample bigrams: {ngram_tokens:?}");
65    println!();
66
67    // 3. Enhanced Count Vectorizer with Document Frequency Filtering
68    println!("3. Count Vectorizer with Document Frequency Filtering");
69    let mut count_vec_filtered = EnhancedCountVectorizer::new()
70        .set_min_df(0.3)?  // Token must appear in at least 30% of documents
71        .set_max_df(0.8)?; // Token must appear in at most 80% of documents
72
73    count_vec_filtered.fit(&documents)?;
74
75    println!(
76        "Vocabulary size after DF filtering: {}",
77        count_vec_filtered.vocabulary().len()
78    );
79    println!();
80
81    // 4. Enhanced TF-IDF Vectorizer with N-grams
82    println!("4. Enhanced TF-IDF Vectorizer with N-grams");
83    let mut tfidf_vec = EnhancedTfidfVectorizer::new()
84        .set_ngram_range((1, 2))?
85        .set_max_features(Some(50))
86        .set_smooth_idf(true)
87        .set_sublinear_tf(true)
88        .set_norm(Some("l2".to_string()))?;
89
90    tfidf_vec.fit(&documents)?;
91    let tfidf_matrix = tfidf_vec.transform_batch(&documents)?;
92
93    println!("TF-IDF matrix shape: {:?}", tfidf_matrix.shape());
94    println!("TF-IDF with smoothing and sublinear TF applied");
95
96    // Show TF-IDF values for first document
97    let first_doc_tfidf = tfidf_matrix.row(0);
98    let mut top_features: Vec<(String, f64)> = Vec::new();
99
100    for (idx, &value) in first_doc_tfidf.iter().enumerate() {
101        if value > 0.0 {
102            if let Some(token) = tfidf_vec.vocabulary().get_token(idx) {
103                top_features.push((token.to_string(), value));
104            }
105        }
106    }
107
108    top_features.sort_by(|a, b| b.1.partial_cmp(&a.1).expect("Operation failed"));
109    println!("\nTop TF-IDF features for first document:");
110    for (token, score) in top_features.iter().take(5) {
111        println!("  {token}: {score:.3}");
112    }
113    println!();
114
115    // 5. Processing with Text Cleaning
116    println!("5. Vectorization with Text Preprocessing");
117    let cleaner = BasicTextCleaner::new(true, true, true);
118
119    // Clean documents first
120    let cleaned_docs: Result<Vec<_>, _> = documents.iter().map(|doc| cleaner.clean(doc)).collect();
121    let cleaned_docs = cleaned_docs?;
122    let cleaned_refs: Vec<&str> = cleaned_docs.iter().map(|s| s.as_str()).collect();
123
124    let mut tfidf_cleaned = EnhancedTfidfVectorizer::new()
125        .set_ngram_range((1, 2))?
126        .set_max_features(Some(30));
127
128    tfidf_cleaned.fit(&cleaned_refs)?;
129    let cleaned_matrix = tfidf_cleaned.transform_batch(&cleaned_refs)?;
130
131    println!("TF-IDF shape after cleaning: {:?}", cleaned_matrix.shape());
132    println!("Processing pipeline: Clean -> Tokenize -> Vectorize");
133
134    Ok(())
135}

Source

pub fn with_stopwords( remove_special_chars: bool, remove_stopwords: bool, normalize_whitespace: bool, stopwords: HashSet<String>, ) -> Self

Create a text cleaner with custom stopwords

Source

pub fn add_stopwords(&mut self, words: &[&str])

Add stopwords to the cleaner

Source

pub fn is_stopword(&self, word: &str) -> bool

Check if a word is a stopword

Trait Implementations§

Source §

impl Clone for BasicTextCleaner

Source §

fn clone(&self) -> BasicTextCleaner

Returns a duplicate of the value. Read more

1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

Source §

impl Debug for BasicTextCleaner

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Default for BasicTextCleaner

Source §

fn default() -> Self

Returns the “default value” for a type. Read more

Source §

impl TextCleaner for BasicTextCleaner

Source §

fn clean(&self, text: &str) -> Result<String>

Clean the input text

Source §

fn clean_batch(&self, texts: &[&str]) -> Result<Vec<String>>

Clean a batch of texts

Auto Trait Implementations§

§

impl UnwindSafe for BasicTextCleaner

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> CloneToUninit for T
where T: Clone,

Source §

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)

Performs copy-assignment from self to dest. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> IntoEither for T

Source §

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §