pub struct WhitespaceTokenizer;

Expand description

Whitespace tokenizer that splits on any whitespace character.
Implementations§
Source§impl WhitespaceTokenizer
impl WhitespaceTokenizer
Source§ pub fn new() -> Self
pub fn new() -> Self
Create a new whitespace tokenizer
Examples found in repository?
examples/topic_coherence_demo.rs (line 34)
// Demo entry point for topic-coherence evaluation: tokenizes sample documents,
// builds a document-term count matrix, fits LDA models, and prints coherence
// (C_v, UMass, UCI) and diversity metrics.
// (Rendered rustdoc excerpt — the leading numbers are the example file's own
// line numbers in examples/topic_coherence_demo.rs.)
10fn main() -> Result<(), Box<dyn std::error::Error>> {
11 println!("Topic Coherence Evaluation Demo");
12 println!("==============================\n");
13
14 // Sample documents for topic modeling
15 let documents = vec![
16 "Machine learning algorithms are used in artificial intelligence",
17 "Deep learning neural networks process complex data patterns",
18 "Natural language processing enables text understanding",
19 "Computer vision algorithms detect objects in images",
20 "Reinforcement learning agents learn through trial and error",
21 "Supervised learning requires labeled training data",
22 "Unsupervised learning discovers hidden patterns",
23 "Transfer learning reuses pretrained models",
24 "Statistical models analyze numerical data distributions",
25 "Regression analysis predicts continuous outcomes",
26 "Classification algorithms categorize data points",
27 "Time series analysis forecasts temporal patterns",
28 "Clustering groups similar data together",
29 "Feature engineering improves model performance",
30 "Model validation prevents overfitting",
31 ];
32
33 // Tokenize documents
// NOTE(review): unwrap() assumes tokenize() cannot fail on these plain-ASCII
// strings — confirm against the tokenizer's error conditions.
34 let tokenizer = WhitespaceTokenizer::new();
35 let tokenized_docs: Vec<Vec<String>> = documents
36 .iter()
37 .map(|doc| tokenizer.tokenize(doc).unwrap())
38 .collect();
39
40 // Create a simple vocabulary for demonstration
// Maps each distinct word to a dense id (0..n_words) in first-seen order.
41 let mut vocabulary = HashMap::new();
42 let mut word_id = 0;
43
44 for doc in &tokenized_docs {
45 for word in doc {
46 if !vocabulary.contains_key(word) {
47 vocabulary.insert(word.clone(), word_id);
48 word_id += 1;
49 }
50 }
51 }
52
53 // Create document-term matrix
// Rows = documents, columns = vocabulary ids; entries are raw term counts.
54 let n_docs = tokenized_docs.len();
55 let n_words = vocabulary.len();
56 let mut doc_term_matrix = scirs2_core::ndarray::Array2::zeros((n_docs, n_words));
57
58 for (doc_idx, doc) in tokenized_docs.iter().enumerate() {
59 for word in doc {
60 if let Some(&word_id) = vocabulary.get(word) {
61 doc_term_matrix[[doc_idx, word_id]] += 1.0;
62 }
63 }
64 }
65
66 // Train LDA model
67 println!("1. Training LDA Model");
68 println!("--------------------");
69
70 let mut lda = LatentDirichletAllocation::with_ntopics(3);
71 lda.fit(&doc_term_matrix)?;
72
73 // Create reverse vocabulary mapping
// Inverts word -> id into id -> word so topic term ids can be printed as words.
74 let id_to_word: HashMap<usize, String> = vocabulary
75 .iter()
76 .map(|(word, &id)| (id, word.clone()))
77 .collect();
78
79 // Get topics
// 5 = number of top words requested per topic.
80 let topics = lda.get_topics(5, &id_to_word)?;
81
82 println!("Discovered topics:");
83 for (i, topic) in topics.iter().enumerate() {
84 println!("\nTopic {}: ", i + 1);
85 for (word, prob) in &topic.top_words {
86 println!(" {word} ({prob:.4})");
87 }
88 }
89
90 // Calculate coherence metrics
91 println!("\n2. Topic Coherence Metrics");
92 println!("-------------------------");
93
94 let coherence_calc = TopicCoherence::new().with_window_size(5);
95
96 // C_v coherence
97 let cv_coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
98 println!("C_v coherence: {cv_coherence:.4}");
99
100 // UMass coherence
101 let umass_coherence = coherence_calc.umass_coherence(&topics, &tokenized_docs)?;
102 println!("UMass coherence: {umass_coherence:.4}");
103
104 // UCI coherence
105 let uci_coherence = coherence_calc.uci_coherence(&topics, &tokenized_docs)?;
106 println!("UCI coherence: {uci_coherence:.4}");
107
108 // Topic diversity
109 println!("\n3. Topic Diversity");
110 println!("-----------------");
111
112 let diversity = TopicDiversity::calculate(&topics);
113 println!("Topic diversity: {diversity:.4}");
114
115 // Pairwise distances
116 let distances = TopicDiversity::pairwise_distances(&topics);
117 println!("\nPairwise Jaccard distances between topics:");
118 for i in 0..distances.nrows() {
119 for j in 0..distances.ncols() {
120 print!("{:.3} ", distances[[i, j]]);
121 }
122 println!();
123 }
124
125 // Compare different numbers of topics
// Refits LDA for each candidate topic count and records (k, coherence, diversity).
126 println!("\n4. Optimal Topic Number Analysis");
127 println!("-------------------------------");
128
129 let topic_counts = vec![2, 3, 4, 5];
130 let mut results = Vec::new();
131
132 for n_topics in topic_counts {
133 let mut lda = LatentDirichletAllocation::with_ntopics(n_topics);
134 lda.fit(&doc_term_matrix)?;
135
136 let topics = lda.get_topics(5, &id_to_word)?;
137 let coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
138 let diversity = TopicDiversity::calculate(&topics);
139
140 results.push((n_topics, coherence, diversity));
141 println!("{n_topics} topics: coherence={coherence:.4}, diversity={diversity:.4}");
142 }
143
144 // Find optimal number of topics
// Score = coherence + 0.5 * diversity, i.e. diversity weighted at half the
// importance of coherence.
// NOTE(review): partial_cmp().unwrap() panics if any score is NaN — assumes
// both metrics are always finite; the outer unwrap() assumes `results` is
// non-empty (topic_counts is hard-coded and non-empty above).
145 let optimal = results
146 .iter()
147 .max_by(|a, b| {
148 // Balance coherence and diversity
149 let score_a = a.1 + 0.5 * a.2;
150 let score_b = b.1 + 0.5 * b.2;
151 score_a.partial_cmp(&score_b).unwrap()
152 })
153 .unwrap();
154
155 println!(
156 "\nOptimal number of topics: {} (coherence={:.4}, diversity={:.4})",
157 optimal.0, optimal.1, optimal.2
158 );
159
160 // Manual topic example
// Hand-built topics show the metrics can be applied to any LdaTopic list,
// not only model output.
161 println!("\n5. Manual Topic Evaluation");
162 println!("-------------------------");
163
164 let manual_topics = vec![
165 LdaTopic {
166 id: 0,
167 top_words: vec![
168 ("learning".to_string(), 0.15),
169 ("machine".to_string(), 0.12),
170 ("algorithm".to_string(), 0.10),
171 ("data".to_string(), 0.08),
172 ("model".to_string(), 0.07),
173 ],
174 coherence: None,
175 },
176 LdaTopic {
177 id: 1,
178 top_words: vec![
179 ("network".to_string(), 0.14),
180 ("neural".to_string(), 0.13),
181 ("deep".to_string(), 0.11),
182 ("layer".to_string(), 0.09),
183 ("process".to_string(), 0.08),
184 ],
185 coherence: None,
186 },
187 ];
188
189 let manual_coherence = coherence_calc.cv_coherence(&manual_topics, &tokenized_docs)?;
190 let manual_diversity = TopicDiversity::calculate(&manual_topics);
191
192 println!("Manual topics coherence: {manual_coherence:.4}");
193 println!("Manual topics diversity: {manual_diversity:.4}");
194
195 Ok(())
196}Trait Implementations§
Source§impl Clone for WhitespaceTokenizer
impl Clone for WhitespaceTokenizer
Source§fn clone(&self) -> WhitespaceTokenizer
fn clone(&self) -> WhitespaceTokenizer
Returns a duplicate of the value. Read more
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§ impl Debug for WhitespaceTokenizer
impl Debug for WhitespaceTokenizer
Source§impl Default for WhitespaceTokenizer
impl Default for WhitespaceTokenizer
Auto Trait Implementations§
impl Freeze for WhitespaceTokenizer
impl RefUnwindSafe for WhitespaceTokenizer
impl Send for WhitespaceTokenizer
impl Sync for WhitespaceTokenizer
impl Unpin for WhitespaceTokenizer
impl UnwindSafe for WhitespaceTokenizer
Blanket Implementations§
Source§ impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§ impl<T> CloneToUninit for T where
T: Clone,
impl<T> CloneToUninit for T where
T: Clone,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts
self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§ fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts
self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§ impl<T> Pointable for T
impl<T> Pointable for T
Source§ impl<SS, SP> SupersetOf<SS> for SP where
SS: SubsetOf<SP>,
impl<SS, SP> SupersetOf<SS> for SP where
SS: SubsetOf<SP>,
Source§fn to_subset(&self) -> Option<SS>
fn to_subset(&self) -> Option<SS>
The inverse inclusion map: attempts to construct
self from the equivalent element of its
superset. Read more
Source§ fn is_in_subset(&self) -> bool
fn is_in_subset(&self) -> bool
Checks if
self is actually part of its subset T (and can be converted to it). Read more
Source§ fn to_subset_unchecked(&self) -> SS
fn to_subset_unchecked(&self) -> SS
Use with care! Same as
self.to_subset but without any property checks. Always succeeds.
Source§ fn from_subset(element: &SS) -> SP
fn from_subset(element: &SS) -> SP
The inclusion map: converts
self to the equivalent element of its superset.