#![allow(clippy::disallowed_methods)]
use aprender::text::stem::{PorterStemmer, Stemmer};
use aprender::text::stopwords::StopWordsFilter;
use aprender::text::tokenize::{CharTokenizer, WhitespaceTokenizer, WordTokenizer};
use aprender::text::Tokenizer;
fn main() {
println!("╔════════════════════════════════════════════════════════════════╗");
println!("║ Text Preprocessing & NLP Examples ║");
println!("╚════════════════════════════════════════════════════════════════╝\n");
example_1_tokenization();
println!("\n{}", "═".repeat(64));
example_2_stopwords();
println!("\n{}", "═".repeat(64));
example_3_stemming();
println!("\n{}", "═".repeat(64));
example_4_full_pipeline();
}
/// Demonstrates three tokenization strategies (whitespace, word, character)
/// on a fixed sample sentence and prints the tokens each one produces.
fn example_1_tokenization() {
    println!("EXAMPLE 1: Tokenization Strategies");
    println!("{}", "─".repeat(64));

    let text = "Hello, world! Natural Language Processing is amazing.";
    println!("\n📝 Input Text:");
    println!(" \"{text}\"");

    // Whitespace-based: split on spaces only; punctuation stays attached to words.
    println!("\n🔤 Whitespace Tokenization:");
    println!(" Splits on spaces, preserves punctuation");
    let ws_tokens = WhitespaceTokenizer::new()
        .tokenize(text)
        .expect("Tokenization should succeed");
    println!(" Tokens: {ws_tokens:?}");
    println!(" Count: {} tokens", ws_tokens.len());

    // Word-based: punctuation is split off into separate tokens.
    println!("\n🔤 Word Tokenization:");
    println!(" Splits on spaces, separates punctuation");
    let word_tokens = WordTokenizer::new()
        .tokenize(text)
        .expect("Tokenization should succeed");
    println!(" Tokens: {word_tokens:?}");
    println!(" Count: {} tokens", word_tokens.len());

    // Character-based: every character becomes its own token.
    println!("\n🔤 Character Tokenization:");
    println!(" Splits into individual characters");
    let char_tokens = CharTokenizer::new()
        .tokenize("NLP")
        .expect("Tokenization should succeed");
    println!(" Input: \"NLP\"");
    println!(" Tokens: {char_tokens:?}");
    println!(" Count: {} characters", char_tokens.len());

    println!("\n💡 Analysis:");
    println!(" • Whitespace: Simple, fast, preserves punctuation");
    println!(" • Word: Better for analysis, separates punctuation");
    println!(" • Character: Useful for character-level models");
}
/// Demonstrates stop-word removal: first with the built-in English list,
/// then with a small custom domain-specific list, printing reduction stats.
fn example_2_stopwords() {
    println!("EXAMPLE 2: Stop Words Filtering");
    println!("{}", "─".repeat(64));

    let text = "The quick brown fox jumps over the lazy dog in the garden";
    println!("\n📝 Input Text:");
    println!(" \"{text}\"");

    let tokens = WhitespaceTokenizer::new()
        .tokenize(text)
        .expect("Tokenization should succeed");
    println!("\n🔤 Original Tokens ({} words):", tokens.len());
    println!(" {tokens:?}");

    // Built-in English stop-word list.
    println!("\n🚫 Stop Words Filtering:");
    println!(" Removing common words (the, in, over, etc.)");
    let english_filter = StopWordsFilter::english();
    let kept = english_filter.filter(&tokens).expect("Filter should succeed");
    println!("\n✅ Filtered Tokens ({} words):", kept.len());
    println!(" {kept:?}");

    // Reduction statistics relative to the original token count.
    println!("\n💡 Analysis:");
    let reduction = 100.0 * (1.0 - kept.len() as f64 / tokens.len() as f64);
    println!(" Token reduction: {reduction:.1}%");
    println!(" Original: {} words", tokens.len());
    println!(" Filtered: {} words", kept.len());
    println!(" Removed: {} stop words", tokens.len() - kept.len());

    // A second pass with a caller-supplied stop-word list.
    println!("\n🎯 Custom Stop Words:");
    println!(" Filtering domain-specific words");
    let domain_filter = StopWordsFilter::new(vec!["fox", "dog", "garden"]);
    let domain_kept = domain_filter.filter(&kept).expect("Filter should succeed");
    println!(" Custom list: [\"fox\", \"dog\", \"garden\"]");
    println!(" Result: {domain_kept:?}");
}
/// Demonstrates Porter stemming on individual words and on a token batch.
///
/// Also notes the algorithm's key limitation: it is rule-based, so irregular
/// forms such as "better" pass through unchanged rather than mapping to a
/// base form like "good".
fn example_3_stemming() {
    println!("EXAMPLE 3: Stemming (Word Normalization)");
    println!("{}", "─".repeat(64));
    println!("\n📐 Porter Stemming Algorithm:");
    println!(" Reduces words to their root form");
    let stemmer = PorterStemmer::new();

    println!("\n🔍 Single Word Stemming:");
    // (word, human-readable description of the transformation shown).
    // NOTE: "better" is an irregular comparative — the Porter rules leave it
    // unchanged, so the description must not claim it reduces to a base form.
    let examples = vec![
        ("running", "Verb gerund → root"),
        ("studies", "Plural noun → singular"),
        ("better", "Irregular comparative → unchanged"),
        ("flying", "Verb gerund → root"),
        ("happiness", "Noun with suffix → root"),
        ("easily", "Adverb → adjective"),
    ];
    for (word, description) in &examples {
        let stemmed = stemmer.stem(word).expect("Stem should succeed");
        println!(" {word} → {stemmed} ({description})");
    }

    println!("\n🔍 Batch Stemming:");
    let words = vec!["running", "jumped", "flying", "studies", "cats", "quickly"];
    let stemmed = stemmer.stem_tokens(&words).expect("Stem should succeed");
    println!(" Original: {words:?}");
    println!(" Stemmed: {stemmed:?}");

    println!("\n💡 Analysis:");
    println!(" • Normalizes word variations to common root");
    println!(" • Reduces vocabulary size for ML models");
    println!(" • Groups semantically similar words");
    // An unchanged word is a failure to conflate related forms, not
    // over-stemming — the original message mislabeled this trade-off.
    println!(" • Trade-off: Cannot stem irregular forms (\"better\" → \"better\")");
}
/// Runs the complete preprocessing pipeline on a short document:
/// tokenize → lowercase → stop-word filter → stem, printing a sample and
/// running statistics after each stage.
///
/// Fix: the sample slices at steps 1 and 2 previously used `&tokens[..10]`,
/// which panics if the document ever yields fewer than 10 tokens; they are
/// now clamped with `.min(len)`, matching the guard step 3 already used.
fn example_4_full_pipeline() {
    println!("EXAMPLE 4: Complete NLP Preprocessing Pipeline");
    println!("{}", "─".repeat(64));
    let document = "The students are studying machine learning algorithms. \
        They're analyzing different classification models and \
        comparing their performances on various datasets.";
    println!("\n📄 Input Document:");
    println!(" \"{document}\"");

    // Step 1: split into words, separating punctuation into its own tokens.
    println!("\n📍 Step 1: Tokenization");
    let tokenizer = WordTokenizer::new();
    let tokens = tokenizer
        .tokenize(document)
        .expect("Tokenization should succeed");
    println!(" Tokenizer: WordTokenizer (separates punctuation)");
    println!(" Tokens: {} items", tokens.len());
    // Clamp so a shorter document cannot panic the slice.
    println!(" Sample: {:?}...", &tokens[..10.min(tokens.len())]);

    // Step 2: case-fold every token so "The" and "the" collapse together.
    println!("\n📍 Step 2: Lowercase Normalization");
    let lowercase_tokens: Vec<String> = tokens.iter().map(|t| t.to_lowercase()).collect();
    println!(" Converted all tokens to lowercase");
    println!(
        " Sample: {:?}...",
        &lowercase_tokens[..10.min(lowercase_tokens.len())]
    );

    // Step 3: drop common English stop words.
    println!("\n📍 Step 3: Stop Words Filtering");
    let filter = StopWordsFilter::english();
    let filtered_tokens = filter
        .filter(&lowercase_tokens)
        .expect("Filter should succeed");
    println!(" Removed common English stop words");
    println!(
        " Tokens: {} items (removed {})",
        filtered_tokens.len(),
        lowercase_tokens.len() - filtered_tokens.len()
    );
    println!(
        " Sample: {:?}...",
        &filtered_tokens[..8.min(filtered_tokens.len())]
    );

    // Step 4: reduce surviving tokens to their Porter stems.
    println!("\n📍 Step 4: Stemming");
    let stemmer = PorterStemmer::new();
    let stemmed_tokens = stemmer
        .stem_tokens(&filtered_tokens)
        .expect("Stem should succeed");
    println!(" Applied Porter stemmer");
    println!(" Sample transformations:");
    // Only show tokens the stemmer actually changed.
    for (original, stemmed) in filtered_tokens.iter().zip(stemmed_tokens.iter()).take(8) {
        if original != stemmed {
            println!(" {original} → {stemmed}");
        }
    }

    println!("\n✅ Final Processed Tokens:");
    println!(" {stemmed_tokens:?}");

    println!("\n📊 Pipeline Statistics:");
    println!(" Original tokens: {}", tokens.len());
    println!(" After lowercasing: {}", lowercase_tokens.len());
    println!(" After stop words: {}", filtered_tokens.len());
    println!(" After stemming: {}", stemmed_tokens.len());
    let reduction = 100.0 * (1.0 - stemmed_tokens.len() as f64 / tokens.len() as f64);
    println!(" Total reduction: {reduction:.1}%");

    println!("\n💡 Pipeline Benefits:");
    println!(" • Normalized text representation");
    println!(" • Reduced vocabulary size");
    println!(" • Ready for feature extraction (TF-IDF, word embeddings)");
    println!(" • Improved ML model performance");
    println!("\n🎯 Next Steps:");
    println!(" • Vectorization (Bag of Words, TF-IDF)");
    println!(" • Feature extraction");
    println!(" • Model training (classification, clustering)");
}