pub struct NgramTokenizer { /* private fields */ }
Tokenizer for extracting n-grams from text.
Implementations
impl NgramTokenizer
pub fn new(n: usize) -> Result<Self>
Creates a new n-gram tokenizer for n-grams of size `n` (the repository example below builds 2-grams with `NgramTokenizer::new(2)`).
Examples found in repository?
examples/text_processing_demo.rs (line 51)
14fn main() -> Result<(), Box<dyn std::error::Error>> {
15 println!("=== SciRS2 Text Processing Demo ===\n");
16
17 let documents = [
18 "The quick brown fox jumps over the lazy dog.",
19 "A fast red fox leaped over the sleeping canine.",
20 "Machine learning algorithms process textual data efficiently.",
21 "Text processing and natural language understanding are important.",
22 ];
23
24 // 1. Text Normalization
25 println!("1. Text Normalization");
26 let normalizer = BasicNormalizer::new(true, true);
27 for (i, doc) in documents.iter().enumerate() {
28 let normalized = normalizer.normalize(doc)?;
29 println!("Doc {}: {}", i + 1, normalized);
30 }
31 println!();
32
33 // 2. Text Cleaning
34 println!("2. Text Cleaning");
35 let cleaner = BasicTextCleaner::new(true, true, true);
36 for (i, doc) in documents.iter().enumerate() {
37 let cleaned = cleaner.clean(doc)?;
38 println!("Doc {}: {}", i + 1, cleaned);
39 }
40 println!();
41
42 // 3. Tokenization Examples
43 println!("3. Tokenization Examples");
44
45 // Word tokenization
46 let word_tokenizer = WordTokenizer::new(true);
47 let tokens = word_tokenizer.tokenize(documents[0])?;
48 println!("Word tokens: {tokens:?}");
49
50 // N-gram tokenization
51 let ngram_tokenizer = NgramTokenizer::new(2)?;
52 let ngrams = ngram_tokenizer.tokenize(documents[0])?;
53 println!("2-grams: {ngrams:?}");
54
55 // Regex tokenization
56 let regex_tokenizer = RegexTokenizer::new(r"\b\w+\b", false)?;
57 let regex_tokens = regex_tokenizer.tokenize(documents[0])?;
58 println!("Regex tokens: {regex_tokens:?}");
59 println!();
60
61 // 4. Stemming and Lemmatization
62 println!("4. Stemming and Lemmatization");
63 let porter_stemmer = PorterStemmer::new();
64 let lemmatizer = SimpleLemmatizer::new();
65
66 let test_words = vec!["running", "jumped", "better", "processing"];
67 for word in test_words {
68 let stemmed = porter_stemmer.stem(word)?;
69 let lemmatized = lemmatizer.stem(word)?;
70 println!("{word}: stemmed={stemmed}, lemmatized={lemmatized}");
71 }
72 println!();
73
74 // 5. Count Vectorization
75 println!("5. Count Vectorization");
76 let mut count_vectorizer = CountVectorizer::new(false);
77
78 let doc_refs = documents.to_vec();
79 count_vectorizer.fit(&doc_refs)?;
80
81 // Transform individual documents
82 let count_matrix = count_vectorizer.transform_batch(&doc_refs)?;
83 println!("Count vector shape: {:?}", count_matrix.shape());
84 println!("Vocabulary size: {}", count_vectorizer.vocabulary().len());
85
86 println!();
87
88 // 6. TF-IDF Vectorization
89 println!("6. TF-IDF Vectorization");
90 let mut tfidf_vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
91
92 tfidf_vectorizer.fit(&doc_refs)?;
93 let tfidf_matrix = tfidf_vectorizer.transform_batch(&doc_refs)?;
94
95 println!("TF-IDF vector shape: {:?}", tfidf_matrix.shape());
96 println!("Sample TF-IDF values:");
97 for i in 0..3.min(tfidf_matrix.nrows()) {
98 for j in 0..5.min(tfidf_matrix.ncols()) {
99 print!("{:.3} ", tfidf_matrix[[i, j]]);
100 }
101 println!();
102 }
103 println!();
104
105 // 7. Complete Pipeline Example
106 println!("7. Complete Text Processing Pipeline");
107 let testtext = "The cats were running quickly through the gardens.";
108
109 // Normalize
110 let normalized = normalizer.normalize(testtext)?;
111 println!("Normalized: {normalized}");
112
113 // Clean
114 let cleaned = cleaner.clean(&normalized)?;
115 println!("Cleaned: {cleaned}");
116
117 // Tokenize
118 let tokens = word_tokenizer.tokenize(&cleaned)?;
119 println!("Tokens: {tokens:?}");
120
121 // Stem
122 let stemmed_tokens: Result<Vec<_>, _> = tokens
123 .iter()
124 .map(|token| porter_stemmer.stem(token))
125 .collect();
126 let stemmed_tokens = stemmed_tokens?;
127 println!("Stemmed: {stemmed_tokens:?}");
128
129 Ok(())
130}
pub fn with_range(_min_n: usize, maxn: usize) -> Result<Self>
Creates an n-gram tokenizer with a range of n values.
pub fn only_alphanumeric(self, value: bool) -> Self
Sets whether to include only alphanumeric tokens.
pub fn with_separator(self, separator: String) -> Self
Sets the separator used for n-grams.
Trait Implementations§
impl Clone for NgramTokenizer
fn clone(&self) -> NgramTokenizer
Returns a duplicate of the value.
fn clone_from(&mut self, source: &Self) (stable since 1.0.0)
Performs copy-assignment from `source`.
impl Debug for NgramTokenizer
Auto Trait Implementations§
impl Freeze for NgramTokenizer
impl RefUnwindSafe for NgramTokenizer
impl Send for NgramTokenizer
impl Sync for NgramTokenizer
impl Unpin for NgramTokenizer
impl UnwindSafe for NgramTokenizer
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value.
impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left` is `true`. Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left(&self)` returns `true`. Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise.
impl<T> Pointable for T
impl<SS, SP> SupersetOf<SS> for SP
where
    SS: SubsetOf<SP>,
fn to_subset(&self) -> Option<SS>
The inverse inclusion map: attempts to construct `self` from the equivalent element of its superset.
fn is_in_subset(&self) -> bool
Checks if `self` is actually part of its subset `T` (and can be converted to it).
fn to_subset_unchecked(&self) -> SS
Use with care! Same as `self.to_subset` but without any property checks. Always succeeds.
fn from_subset(element: &SS) -> SP
The inclusion map: converts `self` to the equivalent element of its superset.