Struct linfa_preprocessing::CountVectorizerValidParams

source · [−]

pub struct CountVectorizerValidParams { /* private fields */ }

Expand description

Count vectorizer: learns a vocabulary from a sequence of documents (or file paths) and maps each vocabulary entry to an integer value, producing a CountVectorizer that can be used to count the occurrences of each vocabulary entry in any sequence of documents. Alternatively, a user-specified vocabulary can be used for fitting.

Attributes

If a user-defined vocabulary is used for fitting, then the following attributes will not be considered during the fitting phase, but they will still be used by the fitted CountVectorizer to transform any text to be examined.

split_regex: the regular expression used to split documents into tokens. Defaults to r"\b\w\w+\b", which selects "words", using whitespace and punctuation symbols as separators.
convert_to_lowercase: if true, all documents used for fitting will be converted to lowercase. Defaults to true.
n_gram_range: if set to (1,1) single tokens will be candidate vocabulary entries, if (2,2) then adjacent token pairs will be considered, if (1,2) then both single tokens and adjacent token pairs will be considered, and so on. The definition of token depends on the regex used for splitting the documents. The default value is (1,1).
normalize: if true, all characters in the documents used for fitting will be normalized according to Unicode's NFKD normalization. Defaults to true.
document_frequency: specifies the minimum and maximum (relative) document frequencies that each vocabulary entry must satisfy. Defaults to (0., 1.) (i.e., 0% minimum and 100% maximum).
stopwords: optional list of entries to be excluded from the generated vocabulary. Defaults to None.

Struct linfa_preprocessing::CountVectorizerValidParams

Attributes

Implementations

impl CountVectorizerValidParams

pub fn convert_to_lowercase(&self) -> bool

pub fn split_regex(&self) -> Ref<'_, Regex>

pub fn n_gram_range(&self) -> (usize, usize)

pub fn normalize(&self) -> bool

pub fn document_frequency(&self) -> (f32, f32)

pub fn stopwords(&self) -> &Option<HashSet<String>>

impl CountVectorizerValidParams

pub fn fit<T: ToString + Clone, D: Data<Elem = T>>( &self, x: &ArrayBase<D, Ix1>) -> Result<CountVectorizer>

pub fn fit_files<P: AsRef<Path>>( &self, input: &[P], encoding: EncodingRef, trap: DecoderTrap) -> Result<CountVectorizer>

pub fn fit_vocabulary<T: ToString>( &self, words: &[T]) -> Result<CountVectorizer>

Trait Implementations

impl Clone for CountVectorizerValidParams

fn clone(&self) -> CountVectorizerValidParams

fn clone_from(&mut self, source: &Self)

impl Debug for CountVectorizerValidParams

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Auto Trait Implementations

impl !RefUnwindSafe for CountVectorizerValidParams

impl Send for CountVectorizerValidParams

impl !Sync for CountVectorizerValidParams

impl Unpin for CountVectorizerValidParams

impl UnwindSafe for CountVectorizerValidParams

Blanket Implementations

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

pub fn n_gram_range(&self) -> (usize, usize )

pub fn document_frequency(&self) -> (f32, f32 )

pub fn fit<T: ToString + Clone, D: Data<Elem = T>>(
&self,
x: &ArrayBase<D, Ix1>
) -> Result<CountVectorizer>

pub fn fit_files<P: AsRef<Path>>(
&self,
input: &[P],
encoding: EncodingRef,
trap: DecoderTrap
) -> Result<CountVectorizer>

pub fn fit_vocabulary<T: ToString>(
&self,
words: &[T]
) -> Result<CountVectorizer>

impl<T> Any for T where
T: 'static + ?Sized,

impl<T> Borrow<T> for T where
T: ?Sized,

impl<T> BorrowMut<T> for T where
T: ?Sized,

impl<T, U> Into<U> for T where
U: From<T>,

impl<T> ToOwned for T where
T: Clone,

impl<T, U> TryFrom<U> for T where
U: Into<T>,

impl<T, U> TryInto<U> for T where
U: TryFrom<T>,

impl<V, T> VZip<V> for T where
V: MultiLane<T>,