#![allow(clippy::disallowed_methods)]
#![allow(non_snake_case)]
use aprender::classification::{GaussianNB, LogisticRegression};
use aprender::text::stem::{PorterStemmer, Stemmer};
use aprender::text::stopwords::StopWordsFilter;
use aprender::text::tokenize::WhitespaceTokenizer;
use aprender::text::vectorize::{CountVectorizer, TfidfVectorizer};
use aprender::text::Tokenizer;
fn main() {
println!("╔════════════════════════════════════════════════════════════════╗");
println!("║ Text Classification with TF-IDF ║");
println!("╚════════════════════════════════════════════════════════════════╝\n");
example_1_bag_of_words();
println!("\n{}", "═".repeat(64));
example_2_tfidf_classification();
println!("\n{}", "═".repeat(64));
example_3_full_pipeline();
}
/// EXAMPLE 1: binary sentiment classification of movie reviews using raw
/// bag-of-words counts and a Gaussian Naive Bayes classifier.
fn example_1_bag_of_words() {
    println!("EXAMPLE 1: Sentiment Classification (Bag of Words)");
    println!("{}", "─".repeat(64));
    // Six tiny reviews: first three positive (label 1), last three negative (0).
    let train_docs = vec![
        "this movie was excellent and amazing",
        "great film with wonderful acting",
        "fantastic movie loved every minute",
        "terrible movie waste of time",
        "awful film boring and disappointing",
        "horrible acting very bad movie",
    ];
    let train_labels = vec![1, 1, 1, 0, 0, 0];
    println!("\n📚 Training Data: {} movie reviews", train_docs.len());
    println!(" Positive: \"{}\"", train_docs[0]);
    println!(" Negative: \"{}\"", train_docs[3]);
    println!("\n🔧 Vectorization: Bag of Words");
    // Raw token counts over a vocabulary capped at 20 features.
    let mut vectorizer = CountVectorizer::new()
        .with_tokenizer(Box::new(WhitespaceTokenizer::new()))
        .with_max_features(20);
    let counts = vectorizer
        .fit_transform(&train_docs)
        .expect("Vectorization should succeed");
    println!(" Vocabulary size: {}", vectorizer.vocabulary_size());
    println!(
        " Feature matrix: {} × {}",
        counts.n_rows(),
        counts.n_cols()
    );
    // Preview the first ten vocabulary words in column (index) order.
    let mut entries: Vec<_> = vectorizer.vocabulary().iter().collect();
    entries.sort_by(|a, b| a.1.cmp(b.1));
    let preview: Vec<_> = entries.iter().take(10).map(|&(word, _)| word).collect();
    println!(
        "\n Top words: {:?}",
        preview
    );
    let train_features = convert_matrix_to_f32(&counts);
    println!("\n🤖 Training: Gaussian Naive Bayes Classifier");
    let mut classifier = GaussianNB::new();
    classifier
        .fit(&train_features, &train_labels)
        .expect("Training should succeed");
    let test_docs = vec![
        "excellent movie great acting",
        "terrible film very bad",
    ];
    let test_counts = vectorizer
        .transform(&test_docs)
        .expect("Transform should succeed");
    let test_features = convert_matrix_to_f32(&test_counts);
    let test_preds = classifier
        .predict(&test_features)
        .expect("Prediction should succeed");
    println!("\n🔮 Predictions:");
    for (doc, pred) in test_docs.iter().zip(test_preds.iter()) {
        let sentiment = match *pred {
            1 => "Positive ✅",
            _ => "Negative ❌",
        };
        println!(" \"{doc}\"");
        println!(" → {sentiment}");
    }
    // Resubstitution accuracy: evaluated on the training set itself.
    let fitted = classifier
        .predict(&train_features)
        .expect("Prediction should succeed");
    let mut hits = 0usize;
    for (p, l) in fitted.iter().zip(&train_labels) {
        if p == l {
            hits += 1;
        }
    }
    let accuracy = hits as f64 / train_labels.len() as f64;
    println!("\n📊 Training Accuracy: {:.1}%", accuracy * 100.0);
}
/// EXAMPLE 2: binary topic classification (tech vs. sports) using TF-IDF
/// features and logistic regression.
fn example_2_tfidf_classification() {
    println!("EXAMPLE 2: Topic Classification (TF-IDF)");
    println!("{}", "─".repeat(64));
    // Six short articles: first three tech (label 0), last three sports (1).
    let train_docs = vec![
        "python programming language machine learning",
        "artificial intelligence neural networks deep",
        "software development code rust programming",
        "basketball game score team championship",
        "football soccer match goal tournament",
        "tennis player serves match competition",
    ];
    let train_labels = vec![0, 0, 0, 1, 1, 1];
    println!("\n📚 Training Data: {} articles", train_docs.len());
    println!(" Tech: \"{}\"", train_docs[0]);
    println!(" Sports: \"{}\"", train_docs[3]);
    println!("\n🔧 Vectorization: TF-IDF");
    let mut vectorizer =
        TfidfVectorizer::new().with_tokenizer(Box::new(WhitespaceTokenizer::new()));
    let tfidf_train = vectorizer
        .fit_transform(&train_docs)
        .expect("Vectorization should succeed");
    println!(" Vocabulary size: {}", vectorizer.vocabulary_size());
    println!(
        " TF-IDF matrix: {} × {}",
        tfidf_train.n_rows(),
        tfidf_train.n_cols()
    );
    println!("\n IDF values (sample):");
    // Show the first five vocabulary entries in column (index) order.
    let mut entries: Vec<_> = vectorizer.vocabulary().iter().collect();
    entries.sort_by(|a, b| a.1.cmp(b.1));
    for (word, &idx) in entries.iter().take(5) {
        println!(" {}: {:.3}", word, vectorizer.idf_values()[idx]);
    }
    let train_features = convert_matrix_to_f32(&tfidf_train);
    println!("\n🤖 Training: Logistic Regression");
    let mut classifier = LogisticRegression::new()
        .with_learning_rate(0.1)
        .with_max_iter(100);
    classifier
        .fit(&train_features, &train_labels)
        .expect("Training should succeed");
    let test_docs = vec![
        "programming code algorithm",
        "basketball score game",
        "neural network learning",
    ];
    let tfidf_test = vectorizer
        .transform(&test_docs)
        .expect("Transform should succeed");
    let test_features = convert_matrix_to_f32(&tfidf_test);
    let test_preds = classifier.predict(&test_features);
    println!("\n🔮 Predictions:");
    for (doc, pred) in test_docs.iter().zip(test_preds.iter()) {
        let topic = match *pred {
            0 => "Tech 💻",
            _ => "Sports ⚽",
        };
        println!(" \"{doc}\"");
        println!(" → {topic}");
    }
    // Resubstitution accuracy: evaluated on the training set itself.
    let fitted = classifier.predict(&train_features);
    let mut hits = 0usize;
    for (p, l) in fitted.iter().zip(&train_labels) {
        if p == l {
            hits += 1;
        }
    }
    let accuracy = hits as f64 / train_labels.len() as f64;
    println!("\n📊 Training Accuracy: {:.1}%", accuracy * 100.0);
}
/// EXAMPLE 3: end-to-end pipeline —
/// tokenize → lowercase + stop-word filter → Porter stem → TF-IDF → Naive Bayes.
fn example_3_full_pipeline() {
    println!("EXAMPLE 3: Full Text Classification Pipeline");
    println!("{}", "─".repeat(64));
    let raw_docs = vec![
        "The machine learning algorithms are improving rapidly with deep neural networks",
        "The team scored three goals in the championship football match yesterday",
        "Scientists developed new artificial intelligence systems using transformers",
        "The basketball players trained hard for the upcoming tournament games",
    ];
    // 0 = Tech, 1 = Sports.
    let labels = vec![0, 1, 0, 1];
    println!("\n📄 Raw Documents: {} articles", raw_docs.len());
    println!("\n📍 Step 1: Tokenization");
    let tokenizer = WhitespaceTokenizer::new();
    let mut tokenized_docs = Vec::with_capacity(raw_docs.len());
    for doc in &raw_docs {
        let tokens = tokenizer.tokenize(doc).expect("Tokenize should succeed");
        tokenized_docs.push(tokens);
    }
    println!(" Tokens (doc 1): {} words", tokenized_docs[0].len());
    println!("\n📍 Step 2: Lowercase + Stop Words Filtering");
    let filter = StopWordsFilter::english();
    let mut filtered_docs = Vec::with_capacity(tokenized_docs.len());
    for tokens in &tokenized_docs {
        let lower: Vec<String> = tokens.iter().map(|t| t.to_lowercase()).collect();
        let filtered = filter.filter(&lower).expect("Filter should succeed");
        filtered_docs.push(filtered);
    }
    println!(" Removed stop words");
    println!(
        " Before: {} words → After: {} words",
        tokenized_docs[0].len(),
        filtered_docs[0].len()
    );
    println!("\n📍 Step 3: Stemming");
    let stemmer = PorterStemmer::new();
    let mut stemmed_docs = Vec::with_capacity(filtered_docs.len());
    for tokens in &filtered_docs {
        let stemmed = stemmer
            .stem_tokens(tokens)
            .expect("Stemming should succeed");
        stemmed_docs.push(stemmed);
    }
    println!(" Applied Porter stemmer");
    // FIX: `&stemmed_docs[0][..3]` panicked when the first document stemmed to
    // fewer than 3 tokens; `take(3)` prints up to 3 safely.
    let sample: Vec<&String> = stemmed_docs[0].iter().take(3).collect();
    println!(" Sample: {:?}", sample);
    let processed_docs: Vec<String> = stemmed_docs.iter().map(|tokens| tokens.join(" ")).collect();
    println!("\n📍 Step 4: TF-IDF Vectorization");
    let mut vectorizer =
        TfidfVectorizer::new().with_tokenizer(Box::new(WhitespaceTokenizer::new()));
    let X = vectorizer
        .fit_transform(&processed_docs)
        .expect("Vectorization should succeed");
    println!(
        " Vocabulary: {} unique stems",
        vectorizer.vocabulary_size()
    );
    println!(" TF-IDF matrix: {} × {}", X.n_rows(), X.n_cols());
    println!("\n📍 Step 5: Classification (Gaussian Naive Bayes)");
    let X_f32 = convert_matrix_to_f32(&X);
    let mut classifier = GaussianNB::new();
    classifier
        .fit(&X_f32, &labels)
        .expect("Training should succeed");
    let predictions = classifier
        .predict(&X_f32)
        .expect("Prediction should succeed");
    println!("\n🔮 Results:");
    for (i, doc) in raw_docs.iter().enumerate() {
        let topic = if predictions[i] == 0 { "Tech" } else { "Sports" };
        let correct = if predictions[i] == labels[i] { "✅" } else { "❌" };
        println!("\n Doc {}: {correct}", i + 1);
        // FIX: `&doc[..50]` is a byte slice — it panics on documents shorter
        // than 50 bytes or when byte 50 falls inside a multi-byte character.
        // Truncate by chars instead, which is always safe.
        let preview: String = doc.chars().take(50).collect();
        println!(" Text: \"{preview}...\"");
        println!(" Predicted: {topic}");
    }
    let correct_count = predictions
        .iter()
        .zip(&labels)
        .filter(|(p, l)| p == l)
        .count();
    let accuracy = correct_count as f64 / labels.len() as f64;
    println!("\n📊 Pipeline Performance:");
    println!(" Accuracy: {:.1}%", accuracy * 100.0);
    // FIX: the reduction was computed against a hard-coded 20.0 (example 1's
    // max_features cap, unrelated to this pipeline), yielding a meaningless
    // percentage. Compare against the actual raw vocabulary size instead:
    // distinct lowercased tokens before stop-word filtering and stemming.
    let raw_vocab: std::collections::HashSet<String> = tokenized_docs
        .iter()
        .flatten()
        .map(|t| t.to_lowercase())
        .collect();
    let baseline = raw_vocab.len().max(1) as f64; // guard divide-by-zero on empty input
    println!(
        " Vocabulary reduction: {:.1}%",
        (1.0 - vectorizer.vocabulary_size() as f64 / baseline) * 100.0
    );
    println!("\n💡 Pipeline Summary:");
    println!(" • Tokenization → Stop words → Stemming");
    println!(" • TF-IDF vectorization");
    println!(" • Naive Bayes classification");
    println!(" • Ready for production use!");
}
/// Narrows an `f64` matrix to `f32` by casting every element, preserving the
/// row/column shape. Precision beyond `f32` is lost in the cast.
fn convert_matrix_to_f32(
    m: &aprender::primitives::Matrix<f64>,
) -> aprender::primitives::Matrix<f32> {
    let elements = m.as_slice();
    let mut data = Vec::with_capacity(elements.len());
    for &value in elements {
        data.push(value as f32);
    }
    aprender::primitives::Matrix::from_vec(m.n_rows(), m.n_cols(), data)
        .expect("Conversion should succeed")
}