BatchTextProcessor

Struct BatchTextProcessor 

Source
pub struct BatchTextProcessor { /* private fields */ }
Expand description

Batch text processor for large datasets

Implementations§

Source§

impl BatchTextProcessor

Source

pub fn new(batchsize: usize) -> Self

Creates a new batch processor that uses the given batch size (`batchsize`).

Examples found in repository?
examples/ml_integration_demo.rs (line 107)
8fn main() -> Result<(), Box<dyn std::error::Error>> {
9    println!("Machine Learning Integration Demo");
10    println!("================================\n");
11
12    // Sample dataset for demonstration
13    let texts = [
14        "This product is absolutely amazing! I love it.",
15        "Terrible experience, would not recommend.",
16        "It's okay, nothing special but works fine.",
17        "Excellent quality and fast shipping.",
18        "Complete waste of money, very disappointed.",
19        "Good value for the price, satisfied with purchase.",
20        "Outstanding service and great product!",
21        "Not worth it, many issues with this item.",
22    ];
23
24    let labels = [
25        "positive", "negative", "neutral", "positive", "negative", "positive", "positive",
26        "negative",
27    ];
28
29    // Create dataset
30    let dataset = TextDataset::new(
31        texts.iter().map(|s| s.to_string()).collect(),
32        labels.iter().map(|s| s.to_string()).collect(),
33    )?;
34
35    // Demonstrate different feature extraction modes
36    println!("1. TF-IDF Feature Extraction");
37    println!("---------------------------");
38
39    let mut tfidf_processor = MLTextPreprocessor::new(FeatureExtractionMode::TfIdf)
40        .with_tfidf_params(0.1, 0.9, Some(100));
41
42    let text_refs = texts.to_vec();
43    let tfidf_features = tfidf_processor.fit_transform(&text_refs)?;
44
45    println!(
46        "TF-IDF Features shape: {:?}",
47        tfidf_features.features.shape()
48    );
49    println!(
50        "First document features (first 5 values): {:?}\n",
51        &tfidf_features
52            .features
53            .row(0)
54            .iter()
55            .take(5)
56            .collect::<Vec<_>>()
57    );
58
59    // Topic modeling features
60    println!("2. Topic Modeling Features");
61    println!("-------------------------");
62
63    let mut topic_processor =
64        MLTextPreprocessor::new(FeatureExtractionMode::TopicModeling).with_topic_modeling(3);
65
66    let topic_features = topic_processor.fit_transform(&text_refs)?;
67
68    println!(
69        "Topic Features shape: {:?}",
70        topic_features.features.shape()
71    );
72    println!(
73        "Topic distribution for first document: {:?}\n",
74        topic_features.features.row(0)
75    );
76
77    // Combined features
78    println!("3. Combined Features");
79    println!("-------------------");
80
81    let mut combined_processor = MLTextPreprocessor::new(FeatureExtractionMode::Combined);
82    let combined_features = combined_processor.fit_transform(&text_refs)?;
83
84    println!(
85        "Combined Features shape: {:?}",
86        combined_features.features.shape()
87    );
88    println!("Metadata: {:?}\n", combined_features.metadata);
89
90    // ML Pipeline
91    println!("4. ML Pipeline with Classification");
92    println!("---------------------------------");
93
94    let mut pipeline = TextMLPipeline::with_mode(FeatureExtractionMode::TfIdf)
95        .configure_preprocessor(|p| {
96            p.with_tfidf_params(0.0, 1.0, Some(50))
97                .with_feature_selection(20)
98        });
99
100    let features = pipeline.process(&text_refs)?;
101    println!("Pipeline features shape: {:?}", features.features.shape());
102
103    // Batch processing for large datasets
104    println!("\n5. Batch Processing");
105    println!("-------------------");
106
107    let mut batch_processor = BatchTextProcessor::new(3);
108    let batches = batch_processor.process_batches(&text_refs)?;
109
110    println!("Number of batches: {}", batches.len());
111    for (i, batch) in batches.iter().enumerate() {
112        println!("Batch {} shape: {:?}", i + 1, batch.features.shape());
113    }
114
115    // Feature extraction for classification
116    println!("\n6. Classification with ML Features");
117    println!("----------------------------------");
118
119    // Split data
120    let (train_dataset, test_dataset) = dataset.train_test_split(0.25, Some(42))?;
121
122    // Extract features
123    let traintexts: Vec<&str> = train_dataset.texts.iter().map(|s| s.as_ref()).collect();
124    let testtexts: Vec<&str> = test_dataset.texts.iter().map(|s| s.as_ref()).collect();
125
126    let mut feature_extractor = MLTextPreprocessor::new(FeatureExtractionMode::TfIdf);
127    feature_extractor.fit(&traintexts)?;
128
129    let train_features = feature_extractor.transform(&traintexts)?;
130    let test_features = feature_extractor.transform(&testtexts)?;
131
132    println!("Training features: {:?}", train_features.features.shape());
133    println!("Test features: {:?}", test_features.features.shape());
134
135    // In a real scenario, you would now use these features with a classifier
136    println!("\nFeatures are ready for machine learning models!");
137
138    // Demonstrate feature statistics
139    println!("\n7. Feature Statistics");
140    println!("--------------------");
141
142    let feature_means = train_features
143        .features
144        .mean_axis(scirs2_core::ndarray::Axis(0))
145        .unwrap();
146    let feature_stds = train_features
147        .features
148        .std_axis(scirs2_core::ndarray::Axis(0), 0.0);
149
150    println!(
151        "Mean of first 5 features: {:?}",
152        &feature_means.iter().take(5).collect::<Vec<_>>()
153    );
154    println!(
155        "Std of first 5 features: {:?}",
156        &feature_stds.iter().take(5).collect::<Vec<_>>()
157    );
158
159    Ok(())
160}
Source

pub fn process_batches(&mut self, texts: &[&str]) -> Result<Vec<TextFeatures>>

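Processes the given texts in batches, returning a list of `TextFeatures` (one entry per batch, as the example below iterates over them) or an error.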

Examples found in repository?
examples/ml_integration_demo.rs (line 108)
8fn main() -> Result<(), Box<dyn std::error::Error>> {
9    println!("Machine Learning Integration Demo");
10    println!("================================\n");
11
12    // Sample dataset for demonstration
13    let texts = [
14        "This product is absolutely amazing! I love it.",
15        "Terrible experience, would not recommend.",
16        "It's okay, nothing special but works fine.",
17        "Excellent quality and fast shipping.",
18        "Complete waste of money, very disappointed.",
19        "Good value for the price, satisfied with purchase.",
20        "Outstanding service and great product!",
21        "Not worth it, many issues with this item.",
22    ];
23
24    let labels = [
25        "positive", "negative", "neutral", "positive", "negative", "positive", "positive",
26        "negative",
27    ];
28
29    // Create dataset
30    let dataset = TextDataset::new(
31        texts.iter().map(|s| s.to_string()).collect(),
32        labels.iter().map(|s| s.to_string()).collect(),
33    )?;
34
35    // Demonstrate different feature extraction modes
36    println!("1. TF-IDF Feature Extraction");
37    println!("---------------------------");
38
39    let mut tfidf_processor = MLTextPreprocessor::new(FeatureExtractionMode::TfIdf)
40        .with_tfidf_params(0.1, 0.9, Some(100));
41
42    let text_refs = texts.to_vec();
43    let tfidf_features = tfidf_processor.fit_transform(&text_refs)?;
44
45    println!(
46        "TF-IDF Features shape: {:?}",
47        tfidf_features.features.shape()
48    );
49    println!(
50        "First document features (first 5 values): {:?}\n",
51        &tfidf_features
52            .features
53            .row(0)
54            .iter()
55            .take(5)
56            .collect::<Vec<_>>()
57    );
58
59    // Topic modeling features
60    println!("2. Topic Modeling Features");
61    println!("-------------------------");
62
63    let mut topic_processor =
64        MLTextPreprocessor::new(FeatureExtractionMode::TopicModeling).with_topic_modeling(3);
65
66    let topic_features = topic_processor.fit_transform(&text_refs)?;
67
68    println!(
69        "Topic Features shape: {:?}",
70        topic_features.features.shape()
71    );
72    println!(
73        "Topic distribution for first document: {:?}\n",
74        topic_features.features.row(0)
75    );
76
77    // Combined features
78    println!("3. Combined Features");
79    println!("-------------------");
80
81    let mut combined_processor = MLTextPreprocessor::new(FeatureExtractionMode::Combined);
82    let combined_features = combined_processor.fit_transform(&text_refs)?;
83
84    println!(
85        "Combined Features shape: {:?}",
86        combined_features.features.shape()
87    );
88    println!("Metadata: {:?}\n", combined_features.metadata);
89
90    // ML Pipeline
91    println!("4. ML Pipeline with Classification");
92    println!("---------------------------------");
93
94    let mut pipeline = TextMLPipeline::with_mode(FeatureExtractionMode::TfIdf)
95        .configure_preprocessor(|p| {
96            p.with_tfidf_params(0.0, 1.0, Some(50))
97                .with_feature_selection(20)
98        });
99
100    let features = pipeline.process(&text_refs)?;
101    println!("Pipeline features shape: {:?}", features.features.shape());
102
103    // Batch processing for large datasets
104    println!("\n5. Batch Processing");
105    println!("-------------------");
106
107    let mut batch_processor = BatchTextProcessor::new(3);
108    let batches = batch_processor.process_batches(&text_refs)?;
109
110    println!("Number of batches: {}", batches.len());
111    for (i, batch) in batches.iter().enumerate() {
112        println!("Batch {} shape: {:?}", i + 1, batch.features.shape());
113    }
114
115    // Feature extraction for classification
116    println!("\n6. Classification with ML Features");
117    println!("----------------------------------");
118
119    // Split data
120    let (train_dataset, test_dataset) = dataset.train_test_split(0.25, Some(42))?;
121
122    // Extract features
123    let traintexts: Vec<&str> = train_dataset.texts.iter().map(|s| s.as_ref()).collect();
124    let testtexts: Vec<&str> = test_dataset.texts.iter().map(|s| s.as_ref()).collect();
125
126    let mut feature_extractor = MLTextPreprocessor::new(FeatureExtractionMode::TfIdf);
127    feature_extractor.fit(&traintexts)?;
128
129    let train_features = feature_extractor.transform(&traintexts)?;
130    let test_features = feature_extractor.transform(&testtexts)?;
131
132    println!("Training features: {:?}", train_features.features.shape());
133    println!("Test features: {:?}", test_features.features.shape());
134
135    // In a real scenario, you would now use these features with a classifier
136    println!("\nFeatures are ready for machine learning models!");
137
138    // Demonstrate feature statistics
139    println!("\n7. Feature Statistics");
140    println!("--------------------");
141
142    let feature_means = train_features
143        .features
144        .mean_axis(scirs2_core::ndarray::Axis(0))
145        .unwrap();
146    let feature_stds = train_features
147        .features
148        .std_axis(scirs2_core::ndarray::Axis(0), 0.0);
149
150    println!(
151        "Mean of first 5 features: {:?}",
152        &feature_means.iter().take(5).collect::<Vec<_>>()
153    );
154    println!(
155        "Std of first 5 features: {:?}",
156        &feature_stds.iter().take(5).collect::<Vec<_>>()
157    );
158
159    Ok(())
160}

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

impl<T> Pointable for T

Source§

const ALIGN: usize

The alignment of pointer.
Source§

type Init = T

The type for initializers.
Source§

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes an object with the given initializer. Read more
Source§

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more
Source§

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more
Source§

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more
Source§

impl<T> Same for T

Source§

type Output = T

Should always be Self
Source§

impl<SS, SP> SupersetOf<SS> for SP
where SS: SubsetOf<SP>,

Source§

fn to_subset(&self) -> Option<SS>

The inverse inclusion map: attempts to construct self from the equivalent element of its superset. Read more
Source§

fn is_in_subset(&self) -> bool

Checks if self is actually part of its subset T (and can be converted to it).
Source§

fn to_subset_unchecked(&self) -> SS

Use with care! Same as self.to_subset but without any property checks. Always succeeds.
Source§

fn from_subset(element: &SS) -> SP

The inclusion map: converts self to the equivalent element of its superset.
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V