pub struct LengthFilter {
pub min_length: usize,
pub max_length: usize,
}
Expand description
Filter tokens by length.
Fields§
§min_length: usize — Minimum token length
§max_length: usize — Maximum token length
Implementations§
Source§impl LengthFilter
impl LengthFilter
Sourcepub fn new(_min_length: usize, maxlength: usize) -> Self
pub fn new(_min_length: usize, maxlength: usize) -> Self
Create a new length filter
Examples found in repository?
examples/token_filtering_demo.rs (line 24)
8 fn main() -> Result<()> {
9 println!("Token Filtering Demo");
10 println!("===================\n");
11
12 // Create a sample text
13 let text = "The quick brown fox jumps over the lazy dog. The fox is quick and brown.";
14 println!("Original text: {text}\n");
15
16 // Create a tokenizer
17 let tokenizer = WordTokenizer::default();
18 let tokens = tokenizer.tokenize(text)?;
19 println!("Tokenized: {tokens:?}\n");
20
21 // 1. Filter by length
22 println!("1. Length Filtering");
23 println!("------------------");
24 let length_filter = LengthFilter::new(4, 6);
25 let filtered = length_filter.apply(&tokens);
26 println!("Tokens with length 4-6: {filtered:?}");
27
28 let filteredtext = length_filter.filtertext(text, &tokenizer)?;
29 println!("Filtered text: {filteredtext}\n");
30
31 // 2. Filter by frequency
32 println!("2. Frequency Filtering");
33 println!("---------------------");
34
35 // Create token counts
36 let mut counts = HashMap::new();
37 for token in &tokens {
38 *counts.entry(token.clone()).or_insert(0) += 1;
39 }
40
41 // Print counts
42 println!("Token counts:");
43 for (token, count) in &counts {
44 println!(" {token} : {count}");
45 }
46
47 // Filter tokens that appear more than once
48 let freq_filter = FrequencyFilter::from_counts(counts.clone(), 2);
49 let filtered = freq_filter.apply(&tokens);
50 println!("\nTokens that appear 2+ times: {filtered:?}");
51
52 let filteredtext = freq_filter.filtertext(text, &tokenizer)?;
53 println!("Filtered text: {filteredtext}\n");
54
55 // 3. Filter by regex pattern
56 println!("3. Regex Filtering");
57 println!("-----------------");
58
59 // Keep only tokens that contain a vowel followed by 'w' or 'r'
60 let regex_filter = RegexFilter::new("[aeiou][wr]", true)?;
61 let filtered = regex_filter.apply(&tokens);
62 println!("Tokens containing a vowel followed by 'w' or 'r': {filtered:?}");
63
64 let filteredtext = regex_filter.filtertext(text, &tokenizer)?;
65 println!("Filtered text: {filteredtext}\n");
66
67 // 4. Stopwords filtering
68 println!("4. Stopwords Filtering");
69 println!("---------------------");
70
71 // Define common stopwords
72 let stopwords = vec![
73 "the".to_string(),
74 "is".to_string(),
75 "and".to_string(),
76 "over".to_string(),
77 "a".to_string(),
78 "an".to_string(),
79 ];
80
81 let stopwords_filter = StopwordsFilter::new(stopwords, true);
82 let filtered = stopwords_filter.apply(&tokens);
83 println!("Tokens with stopwords removed: {filtered:?}");
84
85 let filteredtext = stopwords_filter.filtertext(text, &tokenizer)?;
86 println!("Filtered text: {filteredtext}\n");
87
88 // 5. Composite filtering
89 println!("5. Composite Filtering");
90 println!("---------------------");
91
92 // Create separate filters
93 let length_filter = LengthFilter::new(3, 5);
94 let regex_filter = RegexFilter::new("^[a-z]", true)?;
95
96 // Apply filters sequentially
97 let filtered_by_length = length_filter.apply(&tokens);
98 let filtered = regex_filter.apply(&filtered_by_length);
99 println!("Tokens with length 3-5 AND starting with lowercase letter: {filtered:?}");
100
101 // First filter by length
102 let text_with_length = length_filter.filtertext(text, &tokenizer)?;
103
104 // Then apply regex filter to the already filtered text
105 let filteredtext = regex_filter.filtertext(&text_with_length, &tokenizer)?;
106
107 // We should see only words that are 3-5 chars AND start with lowercase
108 println!("Filtered text: {filteredtext}\n");
109
110 Ok(())
111 }
Source pub fn with_min_length(self, minlength: usize) -> Self
pub fn with_min_length(self, minlength: usize) -> Self
Set minimum token length
Sourcepub fn with_max_length(self, maxlength: usize) -> Self
pub fn with_max_length(self, maxlength: usize) -> Self
Set maximum token length
Trait Implementations§
Source§impl Clone for LengthFilter
impl Clone for LengthFilter
Source§fn clone(&self) -> LengthFilter
fn clone(&self) -> LengthFilter
Returns a duplicate of the value. Read more
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from `source`. Read more
Source§impl Debug for LengthFilter
impl Debug for LengthFilter
Source§impl Default for LengthFilter
impl Default for LengthFilter
Auto Trait Implementations§
impl Freeze for LengthFilter
impl RefUnwindSafe for LengthFilter
impl Send for LengthFilter
impl Sync for LengthFilter
impl Unpin for LengthFilter
impl UnwindSafe for LengthFilter
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
    T: ?Sized,
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> CloneToUninit for T
where
    T: Clone,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left` is `true`.
Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left(&self)` returns `true`.
Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more
Source§impl<T> Pointable for T
impl<T> Pointable for T
Source§impl<SS, SP> SupersetOf<SS> for SP
where
    SS: SubsetOf<SP>,
impl<SS, SP> SupersetOf<SS> for SP
where
    SS: SubsetOf<SP>,
Source§fn to_subset(&self) -> Option<SS>
fn to_subset(&self) -> Option<SS>
The inverse inclusion map: attempts to construct `self` from the equivalent element of its superset. Read more
Source§fn is_in_subset(&self) -> bool
fn is_in_subset(&self) -> bool
Checks if `self` is actually part of its subset `T` (and can be converted to it).
Source§fn to_subset_unchecked(&self) -> SS
fn to_subset_unchecked(&self) -> SS
Use with care! Same as `self.to_subset` but without any property checks. Always succeeds.
Source§fn from_subset(element: &SS) -> SP
fn from_subset(element: &SS) -> SP
The inclusion map: converts `self` to the equivalent element of its superset.