TextDataset

Struct TextDataset 

Source
pub struct TextDataset {
    pub texts: Vec<String>,
    pub labels: Vec<String>,
    /* private fields */
}
Expand description

Text classification dataset

Fields§

§texts: Vec<String>

The text samples

§labels: Vec<String>

The labels for each text

Implementations§

Source§

impl TextDataset

Source

pub fn new(texts: Vec<String>, labels: Vec<String>) -> Result<Self>

Create a new text dataset

Examples found in repository?
examples/ml_sentiment_demo.rs (line 230)
161fn create_sentiment_dataset() -> Result<(TextDataset, TextDataset), Box<dyn std::error::Error>> {
162    // Training data
163    let traintexts = vec![
164        "I absolutely loved this movie! The acting was superb.",
165        "Terrible experience, would not recommend to anyone.",
166        "The product was okay, nothing special but it works.",
167        "Great customer service and fast delivery.",
168        "Disappointing quality for the price paid.",
169        "This is the best purchase I've made all year!",
170        "Waste of money, doesn't work as advertised.",
171        "Mixed feelings about this. Some parts good, others bad.",
172        "Pleasantly surprised by how well this performs.",
173        "Not worth the price. Broke after two weeks.",
174        "Amazing value for the price. Highly recommended!",
175        "Mediocre at best. Wouldn't buy again.",
176        "Fantastic product that exceeds expectations.",
177        "Poor construction quality, arrived damaged.",
178        "It's decent but there are better options available.",
179        "This changed my life! Can't imagine living without it.",
180        "Regret buying this. Customer service was unhelpful.",
181        "Satisfied with my purchase, does what it claims.",
182        "Best in its class. Outstanding performance.",
183        "Very disappointed, doesn't match the description.",
184        "Just average, nothing to write home about.",
185        "Exceeded my expectations in every way.",
186        "One of the worst products I've ever bought.",
187        "Good enough for the price, but has limitations.",
188        "Incredible value! Works perfectly for my needs.",
189        "Would not purchase again. Many flaws.",
190        "Does the job fine, but nothing spectacular.",
191        "Absolutely worthless. Don't waste your money.",
192        "A solid choice. Reliable and well-designed.",
193        "Not impressed at all. Many issues from day one.",
194    ];
195
196    let train_labels = vec![
197        "positive", "negative", "neutral", "positive", "negative", "positive", "negative",
198        "neutral", "positive", "negative", "positive", "negative", "positive", "negative",
199        "neutral", "positive", "negative", "neutral", "positive", "negative", "neutral",
200        "positive", "negative", "neutral", "positive", "negative", "neutral", "negative",
201        "positive", "negative",
202    ];
203
204    // Test data (different examples)
205    let testtexts = [
206        "Loved every minute of it. Highly recommended!",
207        "Terrible product. Complete waste of money.",
208        "It's okay, nothing special but gets the job done.",
209        "Outstanding quality and service. Will buy again!",
210        "Very poor experience. Many issues encountered.",
211        "Adequate for basic needs, but lacks advanced features.",
212        "Couldn't be happier with this purchase.",
213        "Avoid at all costs. Terrible quality.",
214        "Average performance. Neither good nor bad.",
215        "Top-notch quality and design. Very impressed!",
216    ];
217
218    let test_labels = [
219        "positive", "negative", "neutral", "positive", "negative", "neutral", "positive",
220        "negative", "neutral", "positive",
221    ];
222
223    // Convert to strings
224    let traintexts = traintexts.iter().map(|t| t.to_string()).collect();
225    let train_labels = train_labels.iter().map(|l| l.to_string()).collect();
226    let testtexts = testtexts.iter().map(|t| t.to_string()).collect();
227    let test_labels = test_labels.iter().map(|l| l.to_string()).collect();
228
229    // Create datasets
230    let train_dataset = TextDataset::new(traintexts, train_labels)?;
231    let test_dataset = TextDataset::new(testtexts, test_labels)?;
232
233    Ok((train_dataset, test_dataset))
234}
More examples
Hide additional examples
examples/text_classification_demo.rs (line 32)
8fn main() -> Result<(), Box<dyn std::error::Error>> {
9    println!("Text Classification Demo");
10    println!("=======================\n");
11
12    // Create sample dataset
13    let texts = vec![
14        "This movie is absolutely fantastic and amazing!".to_string(),
15        "I really hated this film, it was terrible.".to_string(),
16        "The acting was superb and the plot was engaging.".to_string(),
17        "Worst movie I've ever seen, complete waste of time.".to_string(),
18        "A masterpiece of cinema, truly exceptional work.".to_string(),
19        "Boring, predictable, and poorly executed.".to_string(),
20    ];
21
22    let labels = vec![
23        "positive".to_string(),
24        "negative".to_string(),
25        "positive".to_string(),
26        "negative".to_string(),
27        "positive".to_string(),
28        "negative".to_string(),
29    ];
30
31    // Create dataset
32    let dataset = TextDataset::new(texts, labels)?;
33    println!("Dataset Statistics:");
34    println!("  Total samples: {}", dataset.len());
35    println!("  Number of classes: {}", dataset.unique_labels().len());
36    println!();
37
38    // Split into train and test
39    let (train_dataset, test_dataset) = dataset.train_test_split(0.33, Some(42))?;
40    println!("Train/Test Split:");
41    println!("  Training samples: {}", train_dataset.len());
42    println!("  Test samples: {}", test_dataset.len());
43    println!();
44
45    // Create text processing pipeline
46    let mut pipeline = TextClassificationPipeline::with_tfidf();
47
48    // Fit the pipeline
49    pipeline.fit(&train_dataset)?;
50
51    // Transform to features
52    let train_features = pipeline.transform(&train_dataset)?;
53    let test_features = pipeline.transform(&test_dataset)?;
54
55    println!("Feature Extraction:");
56    println!(
57        "  Train feature shape: ({}, {})",
58        train_features.nrows(),
59        train_features.ncols()
60    );
61    println!(
62        "  Test feature shape: ({}, {})",
63        test_features.nrows(),
64        test_features.ncols()
65    );
66    println!();
67
68    // Demonstrate feature selection
69    let mut feature_selector = TextFeatureSelector::new()
70        .set_max_features(10.0)?
71        .set_min_df(0.1)?
72        .set_max_df(0.9)?;
73
74    let selected_train_features = feature_selector.fit_transform(&train_features)?;
75    println!("Feature Selection:");
76    println!("  Selected features: {}", selected_train_features.ncols());
77    println!();
78
79    // Simulate classification results (in a real scenario, you'd use a classifier)
80    // For demo purposes, we'll create mock predictions based on simple heuristics
81    let _unique_labels = train_dataset.unique_labels();
82
83    // Create binary labels (0 for negative, 1 for positive) for this demo
84    let mut train_labels = Vec::new();
85    let mut test_labels = Vec::new();
86
87    for label in &train_dataset.labels {
88        train_labels.push(if label == "positive" { 1 } else { 0 });
89    }
90
91    for label in &test_dataset.labels {
92        test_labels.push(if label == "positive" { 1 } else { 0 });
93    }
94
95    // Mock predictions (in practice, use a real classifier)
96    let predictions = test_labels.clone(); // Perfect predictions for demo
97
98    // Calculate metrics
99    let metrics = TextClassificationMetrics::new();
100    let accuracy = metrics.accuracy(&predictions, &test_labels)?;
101    let (precision, recall, f1) = metrics.binary_metrics(&predictions, &test_labels)?;
102
103    println!("Classification Metrics:");
104    println!("  Accuracy: {:.2}%", accuracy * 100.0);
105    println!("  Precision: {:.2}%", precision * 100.0);
106    println!("  Recall: {:.2}%", recall * 100.0);
107    println!("  F1 Score: {:.2}%", f1 * 100.0);
108    println!();
109
110    // Create a simple confusion matrix manually since the method isn't available
111    let mut true_positive = 0;
112    let mut true_negative = 0;
113    let mut false_positive = 0;
114    let mut false_negative = 0;
115
116    for (pred, actual) in predictions.iter().zip(test_labels.iter()) {
117        match (pred, actual) {
118            (1, 1) => true_positive += 1,
119            (0, 0) => true_negative += 1,
120            (1, 0) => false_positive += 1,
121            (0, 1) => false_negative += 1,
122            _ => {}
123        }
124    }
125
126    println!("Confusion Matrix:");
127    println!("[ {true_negative} {false_positive} ]");
128    println!("[ {false_negative} {true_positive} ]");
129
130    Ok(())
131}
examples/ml_integration_demo.rs (lines 30-33)
8fn main() -> Result<(), Box<dyn std::error::Error>> {
9    println!("Machine Learning Integration Demo");
10    println!("================================\n");
11
12    // Sample dataset for demonstration
13    let texts = [
14        "This product is absolutely amazing! I love it.",
15        "Terrible experience, would not recommend.",
16        "It's okay, nothing special but works fine.",
17        "Excellent quality and fast shipping.",
18        "Complete waste of money, very disappointed.",
19        "Good value for the price, satisfied with purchase.",
20        "Outstanding service and great product!",
21        "Not worth it, many issues with this item.",
22    ];
23
24    let labels = [
25        "positive", "negative", "neutral", "positive", "negative", "positive", "positive",
26        "negative",
27    ];
28
29    // Create dataset
30    let dataset = TextDataset::new(
31        texts.iter().map(|s| s.to_string()).collect(),
32        labels.iter().map(|s| s.to_string()).collect(),
33    )?;
34
35    // Demonstrate different feature extraction modes
36    println!("1. TF-IDF Feature Extraction");
37    println!("---------------------------");
38
39    let mut tfidf_processor = MLTextPreprocessor::new(FeatureExtractionMode::TfIdf)
40        .with_tfidf_params(0.1, 0.9, Some(100));
41
42    let text_refs = texts.to_vec();
43    let tfidf_features = tfidf_processor.fit_transform(&text_refs)?;
44
45    println!(
46        "TF-IDF Features shape: {:?}",
47        tfidf_features.features.shape()
48    );
49    println!(
50        "First document features (first 5 values): {:?}\n",
51        &tfidf_features
52            .features
53            .row(0)
54            .iter()
55            .take(5)
56            .collect::<Vec<_>>()
57    );
58
59    // Topic modeling features
60    println!("2. Topic Modeling Features");
61    println!("-------------------------");
62
63    let mut topic_processor =
64        MLTextPreprocessor::new(FeatureExtractionMode::TopicModeling).with_topic_modeling(3);
65
66    let topic_features = topic_processor.fit_transform(&text_refs)?;
67
68    println!(
69        "Topic Features shape: {:?}",
70        topic_features.features.shape()
71    );
72    println!(
73        "Topic distribution for first document: {:?}\n",
74        topic_features.features.row(0)
75    );
76
77    // Combined features
78    println!("3. Combined Features");
79    println!("-------------------");
80
81    let mut combined_processor = MLTextPreprocessor::new(FeatureExtractionMode::Combined);
82    let combined_features = combined_processor.fit_transform(&text_refs)?;
83
84    println!(
85        "Combined Features shape: {:?}",
86        combined_features.features.shape()
87    );
88    println!("Metadata: {:?}\n", combined_features.metadata);
89
90    // ML Pipeline
91    println!("4. ML Pipeline with Classification");
92    println!("---------------------------------");
93
94    let mut pipeline = TextMLPipeline::with_mode(FeatureExtractionMode::TfIdf)
95        .configure_preprocessor(|p| {
96            p.with_tfidf_params(0.0, 1.0, Some(50))
97                .with_feature_selection(20)
98        });
99
100    let features = pipeline.process(&text_refs)?;
101    println!("Pipeline features shape: {:?}", features.features.shape());
102
103    // Batch processing for large datasets
104    println!("\n5. Batch Processing");
105    println!("-------------------");
106
107    let mut batch_processor = BatchTextProcessor::new(3);
108    let batches = batch_processor.process_batches(&text_refs)?;
109
110    println!("Number of batches: {}", batches.len());
111    for (i, batch) in batches.iter().enumerate() {
112        println!("Batch {} shape: {:?}", i + 1, batch.features.shape());
113    }
114
115    // Feature extraction for classification
116    println!("\n6. Classification with ML Features");
117    println!("----------------------------------");
118
119    // Split data
120    let (train_dataset, test_dataset) = dataset.train_test_split(0.25, Some(42))?;
121
122    // Extract features
123    let traintexts: Vec<&str> = train_dataset.texts.iter().map(|s| s.as_ref()).collect();
124    let testtexts: Vec<&str> = test_dataset.texts.iter().map(|s| s.as_ref()).collect();
125
126    let mut feature_extractor = MLTextPreprocessor::new(FeatureExtractionMode::TfIdf);
127    feature_extractor.fit(&traintexts)?;
128
129    let train_features = feature_extractor.transform(&traintexts)?;
130    let test_features = feature_extractor.transform(&testtexts)?;
131
132    println!("Training features: {:?}", train_features.features.shape());
133    println!("Test features: {:?}", test_features.features.shape());
134
135    // In a real scenario, you would now use these features with a classifier
136    println!("\nFeatures are ready for machine learning models!");
137
138    // Demonstrate feature statistics
139    println!("\n7. Feature Statistics");
140    println!("--------------------");
141
142    let feature_means = train_features
143        .features
144        .mean_axis(scirs2_core::ndarray::Axis(0))
145        .unwrap();
146    let feature_stds = train_features
147        .features
148        .std_axis(scirs2_core::ndarray::Axis(0), 0.0);
149
150    println!(
151        "Mean of first 5 features: {:?}",
152        &feature_means.iter().take(5).collect::<Vec<_>>()
153    );
154    println!(
155        "Std of first 5 features: {:?}",
156        &feature_stds.iter().take(5).collect::<Vec<_>>()
157    );
158
159    Ok(())
160}
Source

pub fn len(&self) -> usize

Get the number of samples

Examples found in repository?
examples/text_classification_demo.rs (line 34)
8fn main() -> Result<(), Box<dyn std::error::Error>> {
9    println!("Text Classification Demo");
10    println!("=======================\n");
11
12    // Create sample dataset
13    let texts = vec![
14        "This movie is absolutely fantastic and amazing!".to_string(),
15        "I really hated this film, it was terrible.".to_string(),
16        "The acting was superb and the plot was engaging.".to_string(),
17        "Worst movie I've ever seen, complete waste of time.".to_string(),
18        "A masterpiece of cinema, truly exceptional work.".to_string(),
19        "Boring, predictable, and poorly executed.".to_string(),
20    ];
21
22    let labels = vec![
23        "positive".to_string(),
24        "negative".to_string(),
25        "positive".to_string(),
26        "negative".to_string(),
27        "positive".to_string(),
28        "negative".to_string(),
29    ];
30
31    // Create dataset
32    let dataset = TextDataset::new(texts, labels)?;
33    println!("Dataset Statistics:");
34    println!("  Total samples: {}", dataset.len());
35    println!("  Number of classes: {}", dataset.unique_labels().len());
36    println!();
37
38    // Split into train and test
39    let (train_dataset, test_dataset) = dataset.train_test_split(0.33, Some(42))?;
40    println!("Train/Test Split:");
41    println!("  Training samples: {}", train_dataset.len());
42    println!("  Test samples: {}", test_dataset.len());
43    println!();
44
45    // Create text processing pipeline
46    let mut pipeline = TextClassificationPipeline::with_tfidf();
47
48    // Fit the pipeline
49    pipeline.fit(&train_dataset)?;
50
51    // Transform to features
52    let train_features = pipeline.transform(&train_dataset)?;
53    let test_features = pipeline.transform(&test_dataset)?;
54
55    println!("Feature Extraction:");
56    println!(
57        "  Train feature shape: ({}, {})",
58        train_features.nrows(),
59        train_features.ncols()
60    );
61    println!(
62        "  Test feature shape: ({}, {})",
63        test_features.nrows(),
64        test_features.ncols()
65    );
66    println!();
67
68    // Demonstrate feature selection
69    let mut feature_selector = TextFeatureSelector::new()
70        .set_max_features(10.0)?
71        .set_min_df(0.1)?
72        .set_max_df(0.9)?;
73
74    let selected_train_features = feature_selector.fit_transform(&train_features)?;
75    println!("Feature Selection:");
76    println!("  Selected features: {}", selected_train_features.ncols());
77    println!();
78
79    // Simulate classification results (in a real scenario, you'd use a classifier)
80    // For demo purposes, we'll create mock predictions based on simple heuristics
81    let _unique_labels = train_dataset.unique_labels();
82
83    // Create binary labels (0 for negative, 1 for positive) for this demo
84    let mut train_labels = Vec::new();
85    let mut test_labels = Vec::new();
86
87    for label in &train_dataset.labels {
88        train_labels.push(if label == "positive" { 1 } else { 0 });
89    }
90
91    for label in &test_dataset.labels {
92        test_labels.push(if label == "positive" { 1 } else { 0 });
93    }
94
95    // Mock predictions (in practice, use a real classifier)
96    let predictions = test_labels.clone(); // Perfect predictions for demo
97
98    // Calculate metrics
99    let metrics = TextClassificationMetrics::new();
100    let accuracy = metrics.accuracy(&predictions, &test_labels)?;
101    let (precision, recall, f1) = metrics.binary_metrics(&predictions, &test_labels)?;
102
103    println!("Classification Metrics:");
104    println!("  Accuracy: {:.2}%", accuracy * 100.0);
105    println!("  Precision: {:.2}%", precision * 100.0);
106    println!("  Recall: {:.2}%", recall * 100.0);
107    println!("  F1 Score: {:.2}%", f1 * 100.0);
108    println!();
109
110    // Create a simple confusion matrix manually since the method isn't available
111    let mut true_positive = 0;
112    let mut true_negative = 0;
113    let mut false_positive = 0;
114    let mut false_negative = 0;
115
116    for (pred, actual) in predictions.iter().zip(test_labels.iter()) {
117        match (pred, actual) {
118            (1, 1) => true_positive += 1,
119            (0, 0) => true_negative += 1,
120            (1, 0) => false_positive += 1,
121            (0, 1) => false_negative += 1,
122            _ => {}
123        }
124    }
125
126    println!("Confusion Matrix:");
127    println!("[ {true_negative} {false_positive} ]");
128    println!("[ {false_negative} {true_positive} ]");
129
130    Ok(())
131}
Source

pub fn is_empty(&self) -> bool

Check if the dataset is empty

Source

pub fn unique_labels(&self) -> Vec<String>

Get the unique labels in the dataset

Examples found in repository?
examples/text_classification_demo.rs (line 35)
8fn main() -> Result<(), Box<dyn std::error::Error>> {
9    println!("Text Classification Demo");
10    println!("=======================\n");
11
12    // Create sample dataset
13    let texts = vec![
14        "This movie is absolutely fantastic and amazing!".to_string(),
15        "I really hated this film, it was terrible.".to_string(),
16        "The acting was superb and the plot was engaging.".to_string(),
17        "Worst movie I've ever seen, complete waste of time.".to_string(),
18        "A masterpiece of cinema, truly exceptional work.".to_string(),
19        "Boring, predictable, and poorly executed.".to_string(),
20    ];
21
22    let labels = vec![
23        "positive".to_string(),
24        "negative".to_string(),
25        "positive".to_string(),
26        "negative".to_string(),
27        "positive".to_string(),
28        "negative".to_string(),
29    ];
30
31    // Create dataset
32    let dataset = TextDataset::new(texts, labels)?;
33    println!("Dataset Statistics:");
34    println!("  Total samples: {}", dataset.len());
35    println!("  Number of classes: {}", dataset.unique_labels().len());
36    println!();
37
38    // Split into train and test
39    let (train_dataset, test_dataset) = dataset.train_test_split(0.33, Some(42))?;
40    println!("Train/Test Split:");
41    println!("  Training samples: {}", train_dataset.len());
42    println!("  Test samples: {}", test_dataset.len());
43    println!();
44
45    // Create text processing pipeline
46    let mut pipeline = TextClassificationPipeline::with_tfidf();
47
48    // Fit the pipeline
49    pipeline.fit(&train_dataset)?;
50
51    // Transform to features
52    let train_features = pipeline.transform(&train_dataset)?;
53    let test_features = pipeline.transform(&test_dataset)?;
54
55    println!("Feature Extraction:");
56    println!(
57        "  Train feature shape: ({}, {})",
58        train_features.nrows(),
59        train_features.ncols()
60    );
61    println!(
62        "  Test feature shape: ({}, {})",
63        test_features.nrows(),
64        test_features.ncols()
65    );
66    println!();
67
68    // Demonstrate feature selection
69    let mut feature_selector = TextFeatureSelector::new()
70        .set_max_features(10.0)?
71        .set_min_df(0.1)?
72        .set_max_df(0.9)?;
73
74    let selected_train_features = feature_selector.fit_transform(&train_features)?;
75    println!("Feature Selection:");
76    println!("  Selected features: {}", selected_train_features.ncols());
77    println!();
78
79    // Simulate classification results (in a real scenario, you'd use a classifier)
80    // For demo purposes, we'll create mock predictions based on simple heuristics
81    let _unique_labels = train_dataset.unique_labels();
82
83    // Create binary labels (0 for negative, 1 for positive) for this demo
84    let mut train_labels = Vec::new();
85    let mut test_labels = Vec::new();
86
87    for label in &train_dataset.labels {
88        train_labels.push(if label == "positive" { 1 } else { 0 });
89    }
90
91    for label in &test_dataset.labels {
92        test_labels.push(if label == "positive" { 1 } else { 0 });
93    }
94
95    // Mock predictions (in practice, use a real classifier)
96    let predictions = test_labels.clone(); // Perfect predictions for demo
97
98    // Calculate metrics
99    let metrics = TextClassificationMetrics::new();
100    let accuracy = metrics.accuracy(&predictions, &test_labels)?;
101    let (precision, recall, f1) = metrics.binary_metrics(&predictions, &test_labels)?;
102
103    println!("Classification Metrics:");
104    println!("  Accuracy: {:.2}%", accuracy * 100.0);
105    println!("  Precision: {:.2}%", precision * 100.0);
106    println!("  Recall: {:.2}%", recall * 100.0);
107    println!("  F1 Score: {:.2}%", f1 * 100.0);
108    println!();
109
110    // Create a simple confusion matrix manually since the method isn't available
111    let mut true_positive = 0;
112    let mut true_negative = 0;
113    let mut false_positive = 0;
114    let mut false_negative = 0;
115
116    for (pred, actual) in predictions.iter().zip(test_labels.iter()) {
117        match (pred, actual) {
118            (1, 1) => true_positive += 1,
119            (0, 0) => true_negative += 1,
120            (1, 0) => false_positive += 1,
121            (0, 1) => false_negative += 1,
122            _ => {}
123        }
124    }
125
126    println!("Confusion Matrix:");
127    println!("[ {true_negative} {false_positive} ]");
128    println!("[ {false_negative} {true_positive} ]");
129
130    Ok(())
131}
Source

pub fn build_label_index(&mut self) -> Result<&mut Self>

Build a label index mapping

Source

pub fn get_label_indices(&self) -> Result<Vec<usize>>

Get label indices

Source

pub fn train_test_split(&self, test_size: f64, random_seed: Option<u64>) -> Result<(Self, Self)>

Split the dataset into train and test sets

Examples found in repository?
examples/text_classification_demo.rs (line 39)
8fn main() -> Result<(), Box<dyn std::error::Error>> {
9    println!("Text Classification Demo");
10    println!("=======================\n");
11
12    // Create sample dataset
13    let texts = vec![
14        "This movie is absolutely fantastic and amazing!".to_string(),
15        "I really hated this film, it was terrible.".to_string(),
16        "The acting was superb and the plot was engaging.".to_string(),
17        "Worst movie I've ever seen, complete waste of time.".to_string(),
18        "A masterpiece of cinema, truly exceptional work.".to_string(),
19        "Boring, predictable, and poorly executed.".to_string(),
20    ];
21
22    let labels = vec![
23        "positive".to_string(),
24        "negative".to_string(),
25        "positive".to_string(),
26        "negative".to_string(),
27        "positive".to_string(),
28        "negative".to_string(),
29    ];
30
31    // Create dataset
32    let dataset = TextDataset::new(texts, labels)?;
33    println!("Dataset Statistics:");
34    println!("  Total samples: {}", dataset.len());
35    println!("  Number of classes: {}", dataset.unique_labels().len());
36    println!();
37
38    // Split into train and test
39    let (train_dataset, test_dataset) = dataset.train_test_split(0.33, Some(42))?;
40    println!("Train/Test Split:");
41    println!("  Training samples: {}", train_dataset.len());
42    println!("  Test samples: {}", test_dataset.len());
43    println!();
44
45    // Create text processing pipeline
46    let mut pipeline = TextClassificationPipeline::with_tfidf();
47
48    // Fit the pipeline
49    pipeline.fit(&train_dataset)?;
50
51    // Transform to features
52    let train_features = pipeline.transform(&train_dataset)?;
53    let test_features = pipeline.transform(&test_dataset)?;
54
55    println!("Feature Extraction:");
56    println!(
57        "  Train feature shape: ({}, {})",
58        train_features.nrows(),
59        train_features.ncols()
60    );
61    println!(
62        "  Test feature shape: ({}, {})",
63        test_features.nrows(),
64        test_features.ncols()
65    );
66    println!();
67
68    // Demonstrate feature selection
69    let mut feature_selector = TextFeatureSelector::new()
70        .set_max_features(10.0)?
71        .set_min_df(0.1)?
72        .set_max_df(0.9)?;
73
74    let selected_train_features = feature_selector.fit_transform(&train_features)?;
75    println!("Feature Selection:");
76    println!("  Selected features: {}", selected_train_features.ncols());
77    println!();
78
79    // Simulate classification results (in a real scenario, you'd use a classifier)
80    // For demo purposes, we'll create mock predictions based on simple heuristics
81    let _unique_labels = train_dataset.unique_labels();
82
83    // Create binary labels (0 for negative, 1 for positive) for this demo
84    let mut train_labels = Vec::new();
85    let mut test_labels = Vec::new();
86
87    for label in &train_dataset.labels {
88        train_labels.push(if label == "positive" { 1 } else { 0 });
89    }
90
91    for label in &test_dataset.labels {
92        test_labels.push(if label == "positive" { 1 } else { 0 });
93    }
94
95    // Mock predictions (in practice, use a real classifier)
96    let predictions = test_labels.clone(); // Perfect predictions for demo
97
98    // Calculate metrics
99    let metrics = TextClassificationMetrics::new();
100    let accuracy = metrics.accuracy(&predictions, &test_labels)?;
101    let (precision, recall, f1) = metrics.binary_metrics(&predictions, &test_labels)?;
102
103    println!("Classification Metrics:");
104    println!("  Accuracy: {:.2}%", accuracy * 100.0);
105    println!("  Precision: {:.2}%", precision * 100.0);
106    println!("  Recall: {:.2}%", recall * 100.0);
107    println!("  F1 Score: {:.2}%", f1 * 100.0);
108    println!();
109
110    // Create a simple confusion matrix manually since the method isn't available
111    let mut true_positive = 0;
112    let mut true_negative = 0;
113    let mut false_positive = 0;
114    let mut false_negative = 0;
115
116    for (pred, actual) in predictions.iter().zip(test_labels.iter()) {
117        match (pred, actual) {
118            (1, 1) => true_positive += 1,
119            (0, 0) => true_negative += 1,
120            (1, 0) => false_positive += 1,
121            (0, 1) => false_negative += 1,
122            _ => {}
123        }
124    }
125
126    println!("Confusion Matrix:");
127    println!("[ {true_negative} {false_positive} ]");
128    println!("[ {false_negative} {true_positive} ]");
129
130    Ok(())
131}
More examples
Hide additional examples
examples/ml_integration_demo.rs (line 120)
8fn main() -> Result<(), Box<dyn std::error::Error>> {
9    println!("Machine Learning Integration Demo");
10    println!("================================\n");
11
12    // Sample dataset for demonstration
13    let texts = [
14        "This product is absolutely amazing! I love it.",
15        "Terrible experience, would not recommend.",
16        "It's okay, nothing special but works fine.",
17        "Excellent quality and fast shipping.",
18        "Complete waste of money, very disappointed.",
19        "Good value for the price, satisfied with purchase.",
20        "Outstanding service and great product!",
21        "Not worth it, many issues with this item.",
22    ];
23
24    let labels = [
25        "positive", "negative", "neutral", "positive", "negative", "positive", "positive",
26        "negative",
27    ];
28
29    // Create dataset
30    let dataset = TextDataset::new(
31        texts.iter().map(|s| s.to_string()).collect(),
32        labels.iter().map(|s| s.to_string()).collect(),
33    )?;
34
35    // Demonstrate different feature extraction modes
36    println!("1. TF-IDF Feature Extraction");
37    println!("---------------------------");
38
39    let mut tfidf_processor = MLTextPreprocessor::new(FeatureExtractionMode::TfIdf)
40        .with_tfidf_params(0.1, 0.9, Some(100));
41
42    let text_refs = texts.to_vec();
43    let tfidf_features = tfidf_processor.fit_transform(&text_refs)?;
44
45    println!(
46        "TF-IDF Features shape: {:?}",
47        tfidf_features.features.shape()
48    );
49    println!(
50        "First document features (first 5 values): {:?}\n",
51        &tfidf_features
52            .features
53            .row(0)
54            .iter()
55            .take(5)
56            .collect::<Vec<_>>()
57    );
58
59    // Topic modeling features
60    println!("2. Topic Modeling Features");
61    println!("-------------------------");
62
63    let mut topic_processor =
64        MLTextPreprocessor::new(FeatureExtractionMode::TopicModeling).with_topic_modeling(3);
65
66    let topic_features = topic_processor.fit_transform(&text_refs)?;
67
68    println!(
69        "Topic Features shape: {:?}",
70        topic_features.features.shape()
71    );
72    println!(
73        "Topic distribution for first document: {:?}\n",
74        topic_features.features.row(0)
75    );
76
77    // Combined features
78    println!("3. Combined Features");
79    println!("-------------------");
80
81    let mut combined_processor = MLTextPreprocessor::new(FeatureExtractionMode::Combined);
82    let combined_features = combined_processor.fit_transform(&text_refs)?;
83
84    println!(
85        "Combined Features shape: {:?}",
86        combined_features.features.shape()
87    );
88    println!("Metadata: {:?}\n", combined_features.metadata);
89
90    // ML Pipeline
91    println!("4. ML Pipeline with Classification");
92    println!("---------------------------------");
93
94    let mut pipeline = TextMLPipeline::with_mode(FeatureExtractionMode::TfIdf)
95        .configure_preprocessor(|p| {
96            p.with_tfidf_params(0.0, 1.0, Some(50))
97                .with_feature_selection(20)
98        });
99
100    let features = pipeline.process(&text_refs)?;
101    println!("Pipeline features shape: {:?}", features.features.shape());
102
103    // Batch processing for large datasets
104    println!("\n5. Batch Processing");
105    println!("-------------------");
106
107    let mut batch_processor = BatchTextProcessor::new(3);
108    let batches = batch_processor.process_batches(&text_refs)?;
109
110    println!("Number of batches: {}", batches.len());
111    for (i, batch) in batches.iter().enumerate() {
112        println!("Batch {} shape: {:?}", i + 1, batch.features.shape());
113    }
114
115    // Feature extraction for classification
116    println!("\n6. Classification with ML Features");
117    println!("----------------------------------");
118
119    // Split data
120    let (train_dataset, test_dataset) = dataset.train_test_split(0.25, Some(42))?;
121
122    // Extract features
123    let traintexts: Vec<&str> = train_dataset.texts.iter().map(|s| s.as_ref()).collect();
124    let testtexts: Vec<&str> = test_dataset.texts.iter().map(|s| s.as_ref()).collect();
125
126    let mut feature_extractor = MLTextPreprocessor::new(FeatureExtractionMode::TfIdf);
127    feature_extractor.fit(&traintexts)?;
128
129    let train_features = feature_extractor.transform(&traintexts)?;
130    let test_features = feature_extractor.transform(&testtexts)?;
131
132    println!("Training features: {:?}", train_features.features.shape());
133    println!("Test features: {:?}", test_features.features.shape());
134
135    // In a real scenario, you would now use these features with a classifier
136    println!("\nFeatures are ready for machine learning models!");
137
138    // Demonstrate feature statistics
139    println!("\n7. Feature Statistics");
140    println!("--------------------");
141
142    let feature_means = train_features
143        .features
144        .mean_axis(scirs2_core::ndarray::Axis(0))
145        .unwrap();
146    let feature_stds = train_features
147        .features
148        .std_axis(scirs2_core::ndarray::Axis(0), 0.0);
149
150    println!(
151        "Mean of first 5 features: {:?}",
152        &feature_means.iter().take(5).collect::<Vec<_>>()
153    );
154    println!(
155        "Std of first 5 features: {:?}",
156        &feature_stds.iter().take(5).collect::<Vec<_>>()
157    );
158
159    Ok(())
160}

Trait Implementations§

Source§

impl Clone for TextDataset

Source§

fn clone(&self) -> TextDataset

Returns a duplicate of the value. Read more
1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
Source§

impl Debug for TextDataset

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> CloneToUninit for T
where T: Clone,

Source§

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

impl<T> Pointable for T

Source§

const ALIGN: usize

The alignment of the pointer.
Source§

type Init = T

The type for initializers.
Source§

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a value with the given initializer. Read more
Source§

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more
Source§

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more
Source§

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more
Source§

impl<T> Same for T

Source§

type Output = T

Should always be Self.
Source§

impl<SS, SP> SupersetOf<SS> for SP
where SS: SubsetOf<SP>,

Source§

fn to_subset(&self) -> Option<SS>

The inverse inclusion map: attempts to construct self from the equivalent element of its superset. Read more
Source§

fn is_in_subset(&self) -> bool

Checks if self is actually part of its subset T (and can be converted to it).
Source§

fn to_subset_unchecked(&self) -> SS

Use with care! Same as self.to_subset but without any property checks. Always succeeds.
Source§

fn from_subset(element: &SS) -> SP

The inclusion map: converts self to the equivalent element of its superset.
Source§

impl<T> ToOwned for T
where T: Clone,

Source§

type Owned = T

The resulting type after obtaining ownership.
Source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
Source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V