pub struct CountVectorizer {
pub max_features: Option<usize>,
pub min_df: usize,
pub max_df: f64,
pub binary: bool,
pub lowercase: bool,
}
Expand description
An unfitted count vectorizer.
Tokenizes documents by splitting on non-alphanumeric boundaries, builds a vocabulary sorted alphabetically, and transforms documents into a term-count matrix.
§Examples
use ferrolearn_preprocess::count_vectorizer::{CountVectorizer, FittedCountVectorizer};
let docs = vec![
"the cat sat".to_string(),
"the cat sat on the mat".to_string(),
];
let cv = CountVectorizer::new();
let fitted = cv.fit(&docs).unwrap();
let counts = fitted.transform(&docs).unwrap();
assert_eq!(counts.nrows(), 2);
assert_eq!(counts.ncols(), fitted.vocabulary().len());
Fields§
max_features: Option<usize>
Maximum number of features (vocabulary size). None means no limit.
min_df: usize
Minimum document frequency (absolute count) for a term to be included.
max_df: f64
Maximum document frequency as a fraction of total documents.
Terms appearing in more than max_df * n_docs documents are excluded.
binary: bool
If true, all counts are clipped to 0/1 (binary occurrence).
lowercase: bool
If true, lowercase all tokens before counting.
Implementations§
Source§impl CountVectorizer
impl CountVectorizer
Sourcepub fn max_features(self, n: usize) -> Self
pub fn max_features(self, n: usize) -> Self
Set the maximum number of features.
Sourcepub fn max_df(self, max_df: f64) -> Self
pub fn max_df(self, max_df: f64) -> Self
Set the maximum document frequency as a fraction of total documents.
Sourcepub fn fit(&self, docs: &[String]) -> Result<FittedCountVectorizer, FerroError>
pub fn fit(&self, docs: &[String]) -> Result<FittedCountVectorizer, FerroError>
Fit the vectorizer on a corpus of documents.
§Errors
Returns FerroError::InsufficientSamples if the corpus is empty.
Returns FerroError::InvalidParameter if max_df is not in (0, 1].
Trait Implementations§
Source§impl Clone for CountVectorizer
impl Clone for CountVectorizer
Source§fn clone(&self) -> CountVectorizer
fn clone(&self) -> CountVectorizer
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl Debug for CountVectorizer
impl Debug for CountVectorizer
Auto Trait Implementations§
impl Freeze for CountVectorizer
impl RefUnwindSafe for CountVectorizer
impl Send for CountVectorizer
impl Sync for CountVectorizer
impl Unpin for CountVectorizer
impl UnsafeUnpin for CountVectorizer
impl UnwindSafe for CountVectorizer
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for T where
T: Clone,
impl<T> CloneToUninit for T where
T: Clone,
Source§impl<T> DistributionExt for T where
T: ?Sized,
impl<T> DistributionExt for T where
T: ?Sized,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§impl<T> Pointable for T
impl<T> Pointable for T
Source§impl<SS, SP> SupersetOf<SS> for SP where
SS: SubsetOf<SP>,
impl<SS, SP> SupersetOf<SS> for SP where
SS: SubsetOf<SP>,
Source§fn to_subset(&self) -> Option<SS>
fn to_subset(&self) -> Option<SS>
The inverse inclusion map: attempts to construct self from the equivalent element of its
superset. Read more
Source§fn is_in_subset(&self) -> bool
fn is_in_subset(&self) -> bool
Checks if self is actually part of its subset T (and can be converted to it).
Source§fn to_subset_unchecked(&self) -> SS
fn to_subset_unchecked(&self) -> SS
Use with care! Same as self.to_subset but without any property checks. Always succeeds.
Source§fn from_subset(element: &SS) -> SP
fn from_subset(element: &SS) -> SP
The inclusion map: converts self to the equivalent element of its superset.
Source§impl<SS, SP> SupersetOf<SS> for SP where
SS: SubsetOf<SP>,
impl<SS, SP> SupersetOf<SS> for SP where
SS: SubsetOf<SP>,
Source§fn to_subset(&self) -> Option<SS>
fn to_subset(&self) -> Option<SS>
The inverse inclusion map: attempts to construct self from the equivalent element of its
superset. Read more
Source§fn is_in_subset(&self) -> bool
fn is_in_subset(&self) -> bool
Checks if self is actually part of its subset T (and can be converted to it).
Source§unsafe fn to_subset_unchecked(&self) -> SS
unsafe fn to_subset_unchecked(&self) -> SS
Use with care! Same as self.to_subset but without any property checks. Always succeeds.
Source§fn from_subset(element: &SS) -> SP
fn from_subset(element: &SS) -> SP
The inclusion map: converts self to the equivalent element of its superset.