enhanced_vectorization_demo/enhanced_vectorization_demo.rs

use scirs2_text::{
    enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer},
    preprocess::{BasicTextCleaner, TextCleaner},
};

#[allow(dead_code)]
fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("=== Enhanced Text Vectorization Demo ===\n");

    let documents = vec![
        "The quick brown fox jumps over the lazy dog.",
        "A fast red fox leaped over the sleeping canine.",
        "Machine learning algorithms process textual data efficiently.",
        "Text processing and natural language understanding are important.",
        "Natural language processing is a field of artificial intelligence.",
        "Deep learning models can understand complex text patterns.",
    ];

    println!("1. Enhanced Count Vectorizer (Unigrams only)");
    let mut count_vec_unigram = EnhancedCountVectorizer::new()
        .set_binary(false)
        .set_max_features(Some(20));

    count_vec_unigram.fit(&documents)?;
    let count_matrix = count_vec_unigram.transform_batch(&documents)?;

    println!("Vocabulary size: {}", count_vec_unigram.vocabulary().len());
    println!("Count matrix shape: {:?}", count_matrix.shape());
    println!();

    println!("2. Enhanced Count Vectorizer (Unigrams + Bigrams)");
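    // An n-gram range of (1, 2) indexes single tokens ("quick") as well as
    // adjacent pairs ("quick brown"), capturing some word-order information
    // at the cost of a larger vocabulary.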
    let mut count_vec_ngram = EnhancedCountVectorizer::new()
        .set_ngram_range((1, 2))?
        .set_max_features(Some(30));

    count_vec_ngram.fit(&documents)?;
    let ngram_matrix = count_vec_ngram.transform_batch(&documents)?;

    println!(
        "Vocabulary size with n-grams: {}",
        count_vec_ngram.vocabulary().len()
    );
    println!("N-gram count matrix shape: {:?}", ngram_matrix.shape());

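    // Bigrams are stored as space-joined strings in this vocabulary, so any
    // token containing a space must be an n-gram with n > 1; collect a few
    // from the first ten vocabulary slots for display.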
    let vocab = count_vec_ngram.vocabulary();
    let mut ngram_tokens: Vec<String> = Vec::new();
    for i in 0..vocab.len().min(10) {
        if let Some(token) = vocab.get_token(i) {
            if token.contains(' ') {
                ngram_tokens.push(token.to_string());
            }
        }
    }
    println!("Sample bigrams: {ngram_tokens:?}");
    println!();

    println!("3. Count Vectorizer with Document Frequency Filtering");
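    // Fractional df thresholds are interpreted relative to corpus size (the
    // usual convention, assumed here): with six documents, min_df = 0.3 drops
    // terms seen in fewer than 0.3 * 6 = 1.8 (i.e. fewer than two) documents,
    // and max_df = 0.8 drops terms seen in more than 4.8 (i.e. five or six)
    // documents, filtering out both rare and near-ubiquitous terms.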
    let mut count_vec_filtered = EnhancedCountVectorizer::new()
        .set_min_df(0.3)?
        .set_max_df(0.8)?;

    count_vec_filtered.fit(&documents)?;

    println!(
        "Vocabulary size after DF filtering: {}",
        count_vec_filtered.vocabulary().len()
    );
    println!();

    println!("4. Enhanced TF-IDF Vectorizer with N-grams");
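    // TF-IDF reweights counts by how informative a term is across the corpus.
    // Under the conventional smoothed formulation (as in scikit-learn; the
    // exact formulas this crate uses are an assumption, not verified):
    //   sublinear TF: tf' = 1 + ln(tf)                 (dampens repeats)
    //   smooth IDF:   idf = ln((1 + n) / (1 + df)) + 1
    // The "l2" norm then rescales each row to unit Euclidean length, so
    // document similarity reduces to a plain dot product.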
    let mut tfidf_vec = EnhancedTfidfVectorizer::new()
        .set_ngram_range((1, 2))?
        .set_max_features(Some(50))
        .set_smooth_idf(true)
        .set_sublinear_tf(true)
        .set_norm(Some("l2".to_string()))?;

    tfidf_vec.fit(&documents)?;
    let tfidf_matrix = tfidf_vec.transform_batch(&documents)?;

    println!("TF-IDF matrix shape: {:?}", tfidf_matrix.shape());
    println!("TF-IDF with smoothing and sublinear TF applied");

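    // Rank the first document's non-zero weights to see which features
    // TF-IDF considers most distinctive for it.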
    let first_doc_tfidf = tfidf_matrix.row(0);
    let mut top_features: Vec<(String, f64)> = Vec::new();

    for (idx, &value) in first_doc_tfidf.iter().enumerate() {
        if value > 0.0 {
            if let Some(token) = tfidf_vec.vocabulary().get_token(idx) {
                top_features.push((token.to_string(), value));
            }
        }
    }

    top_features.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
    println!("\nTop TF-IDF features for first document:");
    for (token, score) in top_features.iter().take(5) {
        println!("  {token}: {score:.3}");
    }
    println!();

    println!("5. Vectorization with Text Preprocessing");
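    // Cleaning before vectorization collapses surface variants (e.g. "The"
    // vs "the") into one token, shrinking the vocabulary. The three boolean
    // flags are positional in this constructor; which options they toggle
    // (lowercasing, punctuation stripping, whitespace normalization are
    // typical) is an assumption here, not confirmed from the API docs.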
    let cleaner = BasicTextCleaner::new(true, true, true);

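    // Collecting into Result<Vec<_>, _> stops at the first cleaning error,
    // which `?` then propagates out of main.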
    let cleaned_docs: Result<Vec<_>, _> = documents.iter().map(|doc| cleaner.clean(doc)).collect();
    let cleaned_docs = cleaned_docs?;
    let cleaned_refs: Vec<&str> = cleaned_docs.iter().map(|s| s.as_str()).collect();

    let mut tfidf_cleaned = EnhancedTfidfVectorizer::new()
        .set_ngram_range((1, 2))?
        .set_max_features(Some(30));

    tfidf_cleaned.fit(&cleaned_refs)?;
    let cleaned_matrix = tfidf_cleaned.transform_batch(&cleaned_refs)?;

    println!("TF-IDF shape after cleaning: {:?}", cleaned_matrix.shape());
    println!("Processing pipeline: Clean -> Tokenize -> Vectorize");

    Ok(())
}