pub trait TokenFilter {
// Required method
fn apply(&self, tokens: &[String]) -> Vec<String>;
// Provided method
fn filtertext(
&self,
text: &str,
tokenizer: &dyn Tokenizer,
) -> Result<String> { ... }
}
Expand description
Trait for token filtering strategies
Required Methods§
Provided Methods§
Source
fn filtertext(&self, text: &str, tokenizer: &dyn Tokenizer) -> Result<String>
Apply the filter directly to text
Examples found in repository?
examples/token_filtering_demo.rs (line 28)
8 fn main() -> Result<()> {
9 println!("Token Filtering Demo");
10 println!("===================\n");
11
12 // Create a sample text
13 let text = "The quick brown fox jumps over the lazy dog. The fox is quick and brown.";
14 println!("Original text: {text}\n");
15
16 // Create a tokenizer
17 let tokenizer = WordTokenizer::default();
18 let tokens = tokenizer.tokenize(text)?;
19 println!("Tokenized: {tokens:?}\n");
20
21 // 1. Filter by length
22 println!("1. Length Filtering");
23 println!("------------------");
24 let length_filter = LengthFilter::new(4, 6);
25 let filtered = length_filter.apply(&tokens);
26 println!("Tokens with length 4-6: {filtered:?}");
27
28 let filteredtext = length_filter.filtertext(text, &tokenizer)?;
29 println!("Filtered text: {filteredtext}\n");
30
31 // 2. Filter by frequency
32 println!("2. Frequency Filtering");
33 println!("---------------------");
34
35 // Create token counts
36 let mut counts = HashMap::new();
37 for token in &tokens {
38 *counts.entry(token.clone()).or_insert(0) += 1;
39 }
40
41 // Print counts
42 println!("Token counts:");
43 for (token, count) in &counts {
44 println!(" {token} : {count}");
45 }
46
47 // Filter tokens that appear more than once
48 let freq_filter = FrequencyFilter::from_counts(counts.clone(), 2);
49 let filtered = freq_filter.apply(&tokens);
50 println!("\nTokens that appear 2+ times: {filtered:?}");
51
52 let filteredtext = freq_filter.filtertext(text, &tokenizer)?;
53 println!("Filtered text: {filteredtext}\n");
54
55 // 3. Filter by regex pattern
56 println!("3. Regex Filtering");
57 println!("-----------------");
58
59 // Keep only tokens that contain a vowel followed by 'w' or 'r'
60 let regex_filter = RegexFilter::new("[aeiou][wr]", true)?;
61 let filtered = regex_filter.apply(&tokens);
62 println!("Tokens containing a vowel followed by 'w' or 'r': {filtered:?}");
63
64 let filteredtext = regex_filter.filtertext(text, &tokenizer)?;
65 println!("Filtered text: {filteredtext}\n");
66
67 // 4. Stopwords filtering
68 println!("4. Stopwords Filtering");
69 println!("---------------------");
70
71 // Define common stopwords
72 let stopwords = vec![
73 "the".to_string(),
74 "is".to_string(),
75 "and".to_string(),
76 "over".to_string(),
77 "a".to_string(),
78 "an".to_string(),
79 ];
80
81 let stopwords_filter = StopwordsFilter::new(stopwords, true);
82 let filtered = stopwords_filter.apply(&tokens);
83 println!("Tokens with stopwords removed: {filtered:?}");
84
85 let filteredtext = stopwords_filter.filtertext(text, &tokenizer)?;
86 println!("Filtered text: {filteredtext}\n");
87
88 // 5. Composite filtering
89 println!("5. Composite Filtering");
90 println!("---------------------");
91
92 // Create separate filters
93 let length_filter = LengthFilter::new(3, 5);
94 let regex_filter = RegexFilter::new("^[a-z]", true)?;
95
96 // Apply filters sequentially
97 let filtered_by_length = length_filter.apply(&tokens);
98 let filtered = regex_filter.apply(&filtered_by_length);
99 println!("Tokens with length 3-5 AND starting with lowercase letter: {filtered:?}");
100
101 // First filter by length
102 let text_with_length = length_filter.filtertext(text, &tokenizer)?;
103
104 // Then apply regex filter to the already filtered text
105 let filteredtext = regex_filter.filtertext(&text_with_length, &tokenizer)?;
106
107 // We should see only words that are 3-5 chars AND start with lowercase
108 println!("Filtered text: {filteredtext}\n");
109
110 Ok(())
111 }