Struct CountVectorizer

Source

pub struct CountVectorizer { /* private fields */ }

Expand description

Converts text documents into a sparse term-count matrix.

Each document becomes a row, each unique token a column. Cell values are the number of times that token appears in that document.

§Example

use scry_learn::text::CountVectorizer;

let mut cv = CountVectorizer::new();
let docs = ["the cat sat", "the dog sat", "the cat played"];
let matrix = cv.fit_transform(&docs);

assert_eq!(matrix.n_rows(), 3);
assert_eq!(matrix.n_cols(), cv.vocabulary().len());

Implementations§

Source §

impl CountVectorizer

Source

pub fn new() -> Self

Create a new CountVectorizer with default settings.

Source

pub fn min_df(self, n: usize) -> Self

Set minimum document frequency (absolute). Tokens appearing in fewer documents are excluded. Default: 1.

Source

pub fn max_df(self, frac: f64) -> Self

Set maximum document frequency as a fraction in (0.0, 1.0]. Tokens appearing in more than this fraction of documents are excluded. Default: 1.0 (no filtering).

Source

pub fn ngram_range(self, min_n: usize, max_n: usize) -> Self

Set n-gram range. Default: (1, 1) (unigrams only).

Source

pub fn max_features(self, n: usize) -> Self

Limit vocabulary to the top n features by total frequency. Default: no limit.

Source

pub fn binary(self, b: bool) -> Self

If true, all non-zero counts become 1 (presence/absence). Default: false.

Source

pub fn fit<S: AsRef<str>>(&mut self, documents: &[S])

Learn vocabulary from documents.

Source

pub fn transform<S: AsRef<str>>(&self, documents: &[S]) -> CsrMatrix

Transform documents into a sparse CSR matrix of counts.

Panics if fit() has not been called.

Source

pub fn fit_transform<S: AsRef<str>>(&mut self, documents: &[S]) -> CsrMatrix

Fit the vocabulary and transform in one step.

Source

pub fn vocabulary(&self) -> &HashMap<String, usize>

Return the learned vocabulary (token → column index).

Source

pub fn get_feature_names(&self) -> Vec<String>

Return feature names sorted by column index.

Source

pub fn n_features(&self) -> usize

Number of features in the vocabulary.

Source

pub fn is_fitted(&self) -> bool

Whether fit() has been called.

Trait Implementations§

Source §

impl Clone for CountVectorizer

Source §

fn clone(&self) -> CountVectorizer

Returns a duplicate of the value. Read more

1.0.0 (const: unstable) · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

Source §

impl Debug for CountVectorizer

Source §

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Source §

impl Default for CountVectorizer

Source §

fn default() -> Self

Returns the “default value” for a type. Read more

Auto Trait Implementations§

§

impl UnwindSafe for CountVectorizer

Blanket Implementations§

Source §

impl<T> Any for T
where T: 'static + ?Sized,

Source §

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

Source §

impl<T> Borrow<T> for T
where T: ?Sized,

Source §

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

Source §

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source §

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

Source §

impl<T> CloneToUninit for T
where T: Clone,

Source §

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)

Performs copy-assignment from self to dest. Read more

Source §

impl<T> From<T> for T

Source §

fn from(t: T) -> T

Returns the argument unchanged.

Source §

impl<T, U> Into<U> for T
where U: From<T>,

Source §

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source §

impl<T> IntoEither for T

Source §

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

Source §

impl<T> Pointable for T

Source §

const ALIGN: usize

The alignment of pointer.

Source §

type Init = T

The type for initializers.

Source §

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a value with the given initializer. Read more

Source §

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

Source §

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

Source §

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

Source §

impl<T> ToOwned for T
where T: Clone,

Source §

type Owned = T

The resulting type after obtaining ownership.

Source §

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more

Source §

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more

Source §

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source §

type Error = Infallible

The type returned in the event of a conversion error.

Source §

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

Source §

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source §

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

Source §

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

Struct CountVectorizer Copy item path

§Example

Implementations§

impl CountVectorizer

pub fn new() -> Self

pub fn min_df(self, n: usize) -> Self

pub fn max_df(self, frac: f64) -> Self

pub fn ngram_range(self, min_n: usize, max_n: usize) -> Self

pub fn max_features(self, n: usize) -> Self

pub fn binary(self, b: bool) -> Self

pub fn fit<S: AsRef<str>>(&mut self, documents: &[S])

pub fn transform<S: AsRef<str>>(&self, documents: &[S]) -> CsrMatrix

pub fn fit_transform<S: AsRef<str>>(&mut self, documents: &[S]) -> CsrMatrix

pub fn vocabulary(&self) -> &HashMap<String, usize>

pub fn get_feature_names(&self) -> Vec<String>

pub fn n_features(&self) -> usize

pub fn is_fitted(&self) -> bool

Trait Implementations§

impl Clone for CountVectorizer

fn clone(&self) -> CountVectorizer

fn clone_from(&mut self, source: &Self)

impl Debug for CountVectorizer

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Default for CountVectorizer

fn default() -> Self

Auto Trait Implementations§

impl Freeze for CountVectorizer

impl RefUnwindSafe for CountVectorizer

impl Send for CountVectorizer

impl Sync for CountVectorizer

impl Unpin for CountVectorizer

impl UnsafeUnpin for CountVectorizer

impl UnwindSafe for CountVectorizer

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Struct CountVectorizer

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,