pub struct TopicCoherence { /* private fields */ }
Expand description
Topic coherence calculator
Implementations§
Source§impl TopicCoherence
impl TopicCoherence
Sourcepub fn new() -> Self
pub fn new() -> Self
Create a new coherence calculator
Examples found in repository?
examples/topic_coherence_demo.rs (line 94)
10fn main() -> Result<(), Box<dyn std::error::Error>> {
11 println!("Topic Coherence Evaluation Demo");
12 println!("==============================\n");
13
14 // Sample documents for topic modeling
15 let documents = vec![
16 "Machine learning algorithms are used in artificial intelligence",
17 "Deep learning neural networks process complex data patterns",
18 "Natural language processing enables text understanding",
19 "Computer vision algorithms detect objects in images",
20 "Reinforcement learning agents learn through trial and error",
21 "Supervised learning requires labeled training data",
22 "Unsupervised learning discovers hidden patterns",
23 "Transfer learning reuses pretrained models",
24 "Statistical models analyze numerical data distributions",
25 "Regression analysis predicts continuous outcomes",
26 "Classification algorithms categorize data points",
27 "Time series analysis forecasts temporal patterns",
28 "Clustering groups similar data together",
29 "Feature engineering improves model performance",
30 "Model validation prevents overfitting",
31 ];
32
33 // Tokenize documents
34 let tokenizer = WhitespaceTokenizer::new();
35 let tokenized_docs: Vec<Vec<String>> = documents
36 .iter()
37 .map(|doc| tokenizer.tokenize(doc).expect("Operation failed"))
38 .collect();
39
40 // Create a simple vocabulary for demonstration
41 let mut vocabulary = HashMap::new();
42 let mut word_id = 0;
43
44 for doc in &tokenized_docs {
45 for word in doc {
46 if !vocabulary.contains_key(word) {
47 vocabulary.insert(word.clone(), word_id);
48 word_id += 1;
49 }
50 }
51 }
52
53 // Create document-term matrix
54 let n_docs = tokenized_docs.len();
55 let n_words = vocabulary.len();
56 let mut doc_term_matrix = scirs2_core::ndarray::Array2::zeros((n_docs, n_words));
57
58 for (doc_idx, doc) in tokenized_docs.iter().enumerate() {
59 for word in doc {
60 if let Some(&word_id) = vocabulary.get(word) {
61 doc_term_matrix[[doc_idx, word_id]] += 1.0;
62 }
63 }
64 }
65
66 // Train LDA model
67 println!("1. Training LDA Model");
68 println!("--------------------");
69
70 let mut lda = LatentDirichletAllocation::with_ntopics(3);
71 lda.fit(&doc_term_matrix)?;
72
73 // Create reverse vocabulary mapping
74 let id_to_word: HashMap<usize, String> = vocabulary
75 .iter()
76 .map(|(word, &id)| (id, word.clone()))
77 .collect();
78
79 // Get topics
80 let topics = lda.get_topics(5, &id_to_word)?;
81
82 println!("Discovered topics:");
83 for (i, topic) in topics.iter().enumerate() {
84 println!("\nTopic {}: ", i + 1);
85 for (word, prob) in &topic.top_words {
86 println!(" {word} ({prob:.4})");
87 }
88 }
89
90 // Calculate coherence metrics
91 println!("\n2. Topic Coherence Metrics");
92 println!("-------------------------");
93
94 let coherence_calc = TopicCoherence::new().with_window_size(5);
95
96 // C_v coherence
97 let cv_coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
98 println!("C_v coherence: {cv_coherence:.4}");
99
100 // UMass coherence
101 let umass_coherence = coherence_calc.umass_coherence(&topics, &tokenized_docs)?;
102 println!("UMass coherence: {umass_coherence:.4}");
103
104 // UCI coherence
105 let uci_coherence = coherence_calc.uci_coherence(&topics, &tokenized_docs)?;
106 println!("UCI coherence: {uci_coherence:.4}");
107
108 // Topic diversity
109 println!("\n3. Topic Diversity");
110 println!("-----------------");
111
112 let diversity = TopicDiversity::calculate(&topics);
113 println!("Topic diversity: {diversity:.4}");
114
115 // Pairwise distances
116 let distances = TopicDiversity::pairwise_distances(&topics);
117 println!("\nPairwise Jaccard distances between topics:");
118 for i in 0..distances.nrows() {
119 for j in 0..distances.ncols() {
120 print!("{:.3} ", distances[[i, j]]);
121 }
122 println!();
123 }
124
125 // Compare different numbers of topics
126 println!("\n4. Optimal Topic Number Analysis");
127 println!("-------------------------------");
128
129 let topic_counts = vec![2, 3, 4, 5];
130 let mut results = Vec::new();
131
132 for n_topics in topic_counts {
133 let mut lda = LatentDirichletAllocation::with_ntopics(n_topics);
134 lda.fit(&doc_term_matrix)?;
135
136 let topics = lda.get_topics(5, &id_to_word)?;
137 let coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
138 let diversity = TopicDiversity::calculate(&topics);
139
140 results.push((n_topics, coherence, diversity));
141 println!("{n_topics} topics: coherence={coherence:.4}, diversity={diversity:.4}");
142 }
143
144 // Find optimal number of topics
145 let optimal = results
146 .iter()
147 .max_by(|a, b| {
148 // Balance coherence and diversity
149 let score_a = a.1 + 0.5 * a.2;
150 let score_b = b.1 + 0.5 * b.2;
151 score_a.partial_cmp(&score_b).expect("Operation failed")
152 })
153 .expect("Operation failed");
154
155 println!(
156 "\nOptimal number of topics: {} (coherence={:.4}, diversity={:.4})",
157 optimal.0, optimal.1, optimal.2
158 );
159
160 // Manual topic example
161 println!("\n5. Manual Topic Evaluation");
162 println!("-------------------------");
163
164 let manual_topics = vec![
165 LdaTopic {
166 id: 0,
167 top_words: vec![
168 ("learning".to_string(), 0.15),
169 ("machine".to_string(), 0.12),
170 ("algorithm".to_string(), 0.10),
171 ("data".to_string(), 0.08),
172 ("model".to_string(), 0.07),
173 ],
174 coherence: None,
175 },
176 LdaTopic {
177 id: 1,
178 top_words: vec![
179 ("network".to_string(), 0.14),
180 ("neural".to_string(), 0.13),
181 ("deep".to_string(), 0.11),
182 ("layer".to_string(), 0.09),
183 ("process".to_string(), 0.08),
184 ],
185 coherence: None,
186 },
187 ];
188
189 let manual_coherence = coherence_calc.cv_coherence(&manual_topics, &tokenized_docs)?;
190 let manual_diversity = TopicDiversity::calculate(&manual_topics);
191
192 println!("Manual topics coherence: {manual_coherence:.4}");
193 println!("Manual topics diversity: {manual_diversity:.4}");
194
195 Ok(())
196}
Sourcepub fn with_window_size(self, windowsize: usize) -> Self
pub fn with_window_size(self, windowsize: usize) -> Self
Set window size for co-occurrence
Examples found in repository?
examples/topic_coherence_demo.rs (line 94)
10fn main() -> Result<(), Box<dyn std::error::Error>> {
11 println!("Topic Coherence Evaluation Demo");
12 println!("==============================\n");
13
14 // Sample documents for topic modeling
15 let documents = vec![
16 "Machine learning algorithms are used in artificial intelligence",
17 "Deep learning neural networks process complex data patterns",
18 "Natural language processing enables text understanding",
19 "Computer vision algorithms detect objects in images",
20 "Reinforcement learning agents learn through trial and error",
21 "Supervised learning requires labeled training data",
22 "Unsupervised learning discovers hidden patterns",
23 "Transfer learning reuses pretrained models",
24 "Statistical models analyze numerical data distributions",
25 "Regression analysis predicts continuous outcomes",
26 "Classification algorithms categorize data points",
27 "Time series analysis forecasts temporal patterns",
28 "Clustering groups similar data together",
29 "Feature engineering improves model performance",
30 "Model validation prevents overfitting",
31 ];
32
33 // Tokenize documents
34 let tokenizer = WhitespaceTokenizer::new();
35 let tokenized_docs: Vec<Vec<String>> = documents
36 .iter()
37 .map(|doc| tokenizer.tokenize(doc).expect("Operation failed"))
38 .collect();
39
40 // Create a simple vocabulary for demonstration
41 let mut vocabulary = HashMap::new();
42 let mut word_id = 0;
43
44 for doc in &tokenized_docs {
45 for word in doc {
46 if !vocabulary.contains_key(word) {
47 vocabulary.insert(word.clone(), word_id);
48 word_id += 1;
49 }
50 }
51 }
52
53 // Create document-term matrix
54 let n_docs = tokenized_docs.len();
55 let n_words = vocabulary.len();
56 let mut doc_term_matrix = scirs2_core::ndarray::Array2::zeros((n_docs, n_words));
57
58 for (doc_idx, doc) in tokenized_docs.iter().enumerate() {
59 for word in doc {
60 if let Some(&word_id) = vocabulary.get(word) {
61 doc_term_matrix[[doc_idx, word_id]] += 1.0;
62 }
63 }
64 }
65
66 // Train LDA model
67 println!("1. Training LDA Model");
68 println!("--------------------");
69
70 let mut lda = LatentDirichletAllocation::with_ntopics(3);
71 lda.fit(&doc_term_matrix)?;
72
73 // Create reverse vocabulary mapping
74 let id_to_word: HashMap<usize, String> = vocabulary
75 .iter()
76 .map(|(word, &id)| (id, word.clone()))
77 .collect();
78
79 // Get topics
80 let topics = lda.get_topics(5, &id_to_word)?;
81
82 println!("Discovered topics:");
83 for (i, topic) in topics.iter().enumerate() {
84 println!("\nTopic {}: ", i + 1);
85 for (word, prob) in &topic.top_words {
86 println!(" {word} ({prob:.4})");
87 }
88 }
89
90 // Calculate coherence metrics
91 println!("\n2. Topic Coherence Metrics");
92 println!("-------------------------");
93
94 let coherence_calc = TopicCoherence::new().with_window_size(5);
95
96 // C_v coherence
97 let cv_coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
98 println!("C_v coherence: {cv_coherence:.4}");
99
100 // UMass coherence
101 let umass_coherence = coherence_calc.umass_coherence(&topics, &tokenized_docs)?;
102 println!("UMass coherence: {umass_coherence:.4}");
103
104 // UCI coherence
105 let uci_coherence = coherence_calc.uci_coherence(&topics, &tokenized_docs)?;
106 println!("UCI coherence: {uci_coherence:.4}");
107
108 // Topic diversity
109 println!("\n3. Topic Diversity");
110 println!("-----------------");
111
112 let diversity = TopicDiversity::calculate(&topics);
113 println!("Topic diversity: {diversity:.4}");
114
115 // Pairwise distances
116 let distances = TopicDiversity::pairwise_distances(&topics);
117 println!("\nPairwise Jaccard distances between topics:");
118 for i in 0..distances.nrows() {
119 for j in 0..distances.ncols() {
120 print!("{:.3} ", distances[[i, j]]);
121 }
122 println!();
123 }
124
125 // Compare different numbers of topics
126 println!("\n4. Optimal Topic Number Analysis");
127 println!("-------------------------------");
128
129 let topic_counts = vec![2, 3, 4, 5];
130 let mut results = Vec::new();
131
132 for n_topics in topic_counts {
133 let mut lda = LatentDirichletAllocation::with_ntopics(n_topics);
134 lda.fit(&doc_term_matrix)?;
135
136 let topics = lda.get_topics(5, &id_to_word)?;
137 let coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
138 let diversity = TopicDiversity::calculate(&topics);
139
140 results.push((n_topics, coherence, diversity));
141 println!("{n_topics} topics: coherence={coherence:.4}, diversity={diversity:.4}");
142 }
143
144 // Find optimal number of topics
145 let optimal = results
146 .iter()
147 .max_by(|a, b| {
148 // Balance coherence and diversity
149 let score_a = a.1 + 0.5 * a.2;
150 let score_b = b.1 + 0.5 * b.2;
151 score_a.partial_cmp(&score_b).expect("Operation failed")
152 })
153 .expect("Operation failed");
154
155 println!(
156 "\nOptimal number of topics: {} (coherence={:.4}, diversity={:.4})",
157 optimal.0, optimal.1, optimal.2
158 );
159
160 // Manual topic example
161 println!("\n5. Manual Topic Evaluation");
162 println!("-------------------------");
163
164 let manual_topics = vec![
165 LdaTopic {
166 id: 0,
167 top_words: vec![
168 ("learning".to_string(), 0.15),
169 ("machine".to_string(), 0.12),
170 ("algorithm".to_string(), 0.10),
171 ("data".to_string(), 0.08),
172 ("model".to_string(), 0.07),
173 ],
174 coherence: None,
175 },
176 LdaTopic {
177 id: 1,
178 top_words: vec![
179 ("network".to_string(), 0.14),
180 ("neural".to_string(), 0.13),
181 ("deep".to_string(), 0.11),
182 ("layer".to_string(), 0.09),
183 ("process".to_string(), 0.08),
184 ],
185 coherence: None,
186 },
187 ];
188
189 let manual_coherence = coherence_calc.cv_coherence(&manual_topics, &tokenized_docs)?;
190 let manual_diversity = TopicDiversity::calculate(&manual_topics);
191
192 println!("Manual topics coherence: {manual_coherence:.4}");
193 println!("Manual topics diversity: {manual_diversity:.4}");
194
195 Ok(())
196}
Sourcepub fn cv_coherence(
&self,
topics: &[Topic],
documents: &[Vec<String>],
) -> Result<f64>
pub fn cv_coherence( &self, topics: &[Topic], documents: &[Vec<String>], ) -> Result<f64>
Calculate C_v coherence (Röder et al., 2015)
Examples found in repository?
examples/topic_coherence_demo.rs (line 97)
10fn main() -> Result<(), Box<dyn std::error::Error>> {
11 println!("Topic Coherence Evaluation Demo");
12 println!("==============================\n");
13
14 // Sample documents for topic modeling
15 let documents = vec![
16 "Machine learning algorithms are used in artificial intelligence",
17 "Deep learning neural networks process complex data patterns",
18 "Natural language processing enables text understanding",
19 "Computer vision algorithms detect objects in images",
20 "Reinforcement learning agents learn through trial and error",
21 "Supervised learning requires labeled training data",
22 "Unsupervised learning discovers hidden patterns",
23 "Transfer learning reuses pretrained models",
24 "Statistical models analyze numerical data distributions",
25 "Regression analysis predicts continuous outcomes",
26 "Classification algorithms categorize data points",
27 "Time series analysis forecasts temporal patterns",
28 "Clustering groups similar data together",
29 "Feature engineering improves model performance",
30 "Model validation prevents overfitting",
31 ];
32
33 // Tokenize documents
34 let tokenizer = WhitespaceTokenizer::new();
35 let tokenized_docs: Vec<Vec<String>> = documents
36 .iter()
37 .map(|doc| tokenizer.tokenize(doc).expect("Operation failed"))
38 .collect();
39
40 // Create a simple vocabulary for demonstration
41 let mut vocabulary = HashMap::new();
42 let mut word_id = 0;
43
44 for doc in &tokenized_docs {
45 for word in doc {
46 if !vocabulary.contains_key(word) {
47 vocabulary.insert(word.clone(), word_id);
48 word_id += 1;
49 }
50 }
51 }
52
53 // Create document-term matrix
54 let n_docs = tokenized_docs.len();
55 let n_words = vocabulary.len();
56 let mut doc_term_matrix = scirs2_core::ndarray::Array2::zeros((n_docs, n_words));
57
58 for (doc_idx, doc) in tokenized_docs.iter().enumerate() {
59 for word in doc {
60 if let Some(&word_id) = vocabulary.get(word) {
61 doc_term_matrix[[doc_idx, word_id]] += 1.0;
62 }
63 }
64 }
65
66 // Train LDA model
67 println!("1. Training LDA Model");
68 println!("--------------------");
69
70 let mut lda = LatentDirichletAllocation::with_ntopics(3);
71 lda.fit(&doc_term_matrix)?;
72
73 // Create reverse vocabulary mapping
74 let id_to_word: HashMap<usize, String> = vocabulary
75 .iter()
76 .map(|(word, &id)| (id, word.clone()))
77 .collect();
78
79 // Get topics
80 let topics = lda.get_topics(5, &id_to_word)?;
81
82 println!("Discovered topics:");
83 for (i, topic) in topics.iter().enumerate() {
84 println!("\nTopic {}: ", i + 1);
85 for (word, prob) in &topic.top_words {
86 println!(" {word} ({prob:.4})");
87 }
88 }
89
90 // Calculate coherence metrics
91 println!("\n2. Topic Coherence Metrics");
92 println!("-------------------------");
93
94 let coherence_calc = TopicCoherence::new().with_window_size(5);
95
96 // C_v coherence
97 let cv_coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
98 println!("C_v coherence: {cv_coherence:.4}");
99
100 // UMass coherence
101 let umass_coherence = coherence_calc.umass_coherence(&topics, &tokenized_docs)?;
102 println!("UMass coherence: {umass_coherence:.4}");
103
104 // UCI coherence
105 let uci_coherence = coherence_calc.uci_coherence(&topics, &tokenized_docs)?;
106 println!("UCI coherence: {uci_coherence:.4}");
107
108 // Topic diversity
109 println!("\n3. Topic Diversity");
110 println!("-----------------");
111
112 let diversity = TopicDiversity::calculate(&topics);
113 println!("Topic diversity: {diversity:.4}");
114
115 // Pairwise distances
116 let distances = TopicDiversity::pairwise_distances(&topics);
117 println!("\nPairwise Jaccard distances between topics:");
118 for i in 0..distances.nrows() {
119 for j in 0..distances.ncols() {
120 print!("{:.3} ", distances[[i, j]]);
121 }
122 println!();
123 }
124
125 // Compare different numbers of topics
126 println!("\n4. Optimal Topic Number Analysis");
127 println!("-------------------------------");
128
129 let topic_counts = vec![2, 3, 4, 5];
130 let mut results = Vec::new();
131
132 for n_topics in topic_counts {
133 let mut lda = LatentDirichletAllocation::with_ntopics(n_topics);
134 lda.fit(&doc_term_matrix)?;
135
136 let topics = lda.get_topics(5, &id_to_word)?;
137 let coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
138 let diversity = TopicDiversity::calculate(&topics);
139
140 results.push((n_topics, coherence, diversity));
141 println!("{n_topics} topics: coherence={coherence:.4}, diversity={diversity:.4}");
142 }
143
144 // Find optimal number of topics
145 let optimal = results
146 .iter()
147 .max_by(|a, b| {
148 // Balance coherence and diversity
149 let score_a = a.1 + 0.5 * a.2;
150 let score_b = b.1 + 0.5 * b.2;
151 score_a.partial_cmp(&score_b).expect("Operation failed")
152 })
153 .expect("Operation failed");
154
155 println!(
156 "\nOptimal number of topics: {} (coherence={:.4}, diversity={:.4})",
157 optimal.0, optimal.1, optimal.2
158 );
159
160 // Manual topic example
161 println!("\n5. Manual Topic Evaluation");
162 println!("-------------------------");
163
164 let manual_topics = vec![
165 LdaTopic {
166 id: 0,
167 top_words: vec![
168 ("learning".to_string(), 0.15),
169 ("machine".to_string(), 0.12),
170 ("algorithm".to_string(), 0.10),
171 ("data".to_string(), 0.08),
172 ("model".to_string(), 0.07),
173 ],
174 coherence: None,
175 },
176 LdaTopic {
177 id: 1,
178 top_words: vec![
179 ("network".to_string(), 0.14),
180 ("neural".to_string(), 0.13),
181 ("deep".to_string(), 0.11),
182 ("layer".to_string(), 0.09),
183 ("process".to_string(), 0.08),
184 ],
185 coherence: None,
186 },
187 ];
188
189 let manual_coherence = coherence_calc.cv_coherence(&manual_topics, &tokenized_docs)?;
190 let manual_diversity = TopicDiversity::calculate(&manual_topics);
191
192 println!("Manual topics coherence: {manual_coherence:.4}");
193 println!("Manual topics diversity: {manual_diversity:.4}");
194
195 Ok(())
196}
Sourcepub fn umass_coherence(
&self,
topics: &[Topic],
documents: &[Vec<String>],
) -> Result<f64>
pub fn umass_coherence( &self, topics: &[Topic], documents: &[Vec<String>], ) -> Result<f64>
Calculate UMass coherence
Examples found in repository?
examples/topic_coherence_demo.rs (line 101)
10fn main() -> Result<(), Box<dyn std::error::Error>> {
11 println!("Topic Coherence Evaluation Demo");
12 println!("==============================\n");
13
14 // Sample documents for topic modeling
15 let documents = vec![
16 "Machine learning algorithms are used in artificial intelligence",
17 "Deep learning neural networks process complex data patterns",
18 "Natural language processing enables text understanding",
19 "Computer vision algorithms detect objects in images",
20 "Reinforcement learning agents learn through trial and error",
21 "Supervised learning requires labeled training data",
22 "Unsupervised learning discovers hidden patterns",
23 "Transfer learning reuses pretrained models",
24 "Statistical models analyze numerical data distributions",
25 "Regression analysis predicts continuous outcomes",
26 "Classification algorithms categorize data points",
27 "Time series analysis forecasts temporal patterns",
28 "Clustering groups similar data together",
29 "Feature engineering improves model performance",
30 "Model validation prevents overfitting",
31 ];
32
33 // Tokenize documents
34 let tokenizer = WhitespaceTokenizer::new();
35 let tokenized_docs: Vec<Vec<String>> = documents
36 .iter()
37 .map(|doc| tokenizer.tokenize(doc).expect("Operation failed"))
38 .collect();
39
40 // Create a simple vocabulary for demonstration
41 let mut vocabulary = HashMap::new();
42 let mut word_id = 0;
43
44 for doc in &tokenized_docs {
45 for word in doc {
46 if !vocabulary.contains_key(word) {
47 vocabulary.insert(word.clone(), word_id);
48 word_id += 1;
49 }
50 }
51 }
52
53 // Create document-term matrix
54 let n_docs = tokenized_docs.len();
55 let n_words = vocabulary.len();
56 let mut doc_term_matrix = scirs2_core::ndarray::Array2::zeros((n_docs, n_words));
57
58 for (doc_idx, doc) in tokenized_docs.iter().enumerate() {
59 for word in doc {
60 if let Some(&word_id) = vocabulary.get(word) {
61 doc_term_matrix[[doc_idx, word_id]] += 1.0;
62 }
63 }
64 }
65
66 // Train LDA model
67 println!("1. Training LDA Model");
68 println!("--------------------");
69
70 let mut lda = LatentDirichletAllocation::with_ntopics(3);
71 lda.fit(&doc_term_matrix)?;
72
73 // Create reverse vocabulary mapping
74 let id_to_word: HashMap<usize, String> = vocabulary
75 .iter()
76 .map(|(word, &id)| (id, word.clone()))
77 .collect();
78
79 // Get topics
80 let topics = lda.get_topics(5, &id_to_word)?;
81
82 println!("Discovered topics:");
83 for (i, topic) in topics.iter().enumerate() {
84 println!("\nTopic {}: ", i + 1);
85 for (word, prob) in &topic.top_words {
86 println!(" {word} ({prob:.4})");
87 }
88 }
89
90 // Calculate coherence metrics
91 println!("\n2. Topic Coherence Metrics");
92 println!("-------------------------");
93
94 let coherence_calc = TopicCoherence::new().with_window_size(5);
95
96 // C_v coherence
97 let cv_coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
98 println!("C_v coherence: {cv_coherence:.4}");
99
100 // UMass coherence
101 let umass_coherence = coherence_calc.umass_coherence(&topics, &tokenized_docs)?;
102 println!("UMass coherence: {umass_coherence:.4}");
103
104 // UCI coherence
105 let uci_coherence = coherence_calc.uci_coherence(&topics, &tokenized_docs)?;
106 println!("UCI coherence: {uci_coherence:.4}");
107
108 // Topic diversity
109 println!("\n3. Topic Diversity");
110 println!("-----------------");
111
112 let diversity = TopicDiversity::calculate(&topics);
113 println!("Topic diversity: {diversity:.4}");
114
115 // Pairwise distances
116 let distances = TopicDiversity::pairwise_distances(&topics);
117 println!("\nPairwise Jaccard distances between topics:");
118 for i in 0..distances.nrows() {
119 for j in 0..distances.ncols() {
120 print!("{:.3} ", distances[[i, j]]);
121 }
122 println!();
123 }
124
125 // Compare different numbers of topics
126 println!("\n4. Optimal Topic Number Analysis");
127 println!("-------------------------------");
128
129 let topic_counts = vec![2, 3, 4, 5];
130 let mut results = Vec::new();
131
132 for n_topics in topic_counts {
133 let mut lda = LatentDirichletAllocation::with_ntopics(n_topics);
134 lda.fit(&doc_term_matrix)?;
135
136 let topics = lda.get_topics(5, &id_to_word)?;
137 let coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
138 let diversity = TopicDiversity::calculate(&topics);
139
140 results.push((n_topics, coherence, diversity));
141 println!("{n_topics} topics: coherence={coherence:.4}, diversity={diversity:.4}");
142 }
143
144 // Find optimal number of topics
145 let optimal = results
146 .iter()
147 .max_by(|a, b| {
148 // Balance coherence and diversity
149 let score_a = a.1 + 0.5 * a.2;
150 let score_b = b.1 + 0.5 * b.2;
151 score_a.partial_cmp(&score_b).expect("Operation failed")
152 })
153 .expect("Operation failed");
154
155 println!(
156 "\nOptimal number of topics: {} (coherence={:.4}, diversity={:.4})",
157 optimal.0, optimal.1, optimal.2
158 );
159
160 // Manual topic example
161 println!("\n5. Manual Topic Evaluation");
162 println!("-------------------------");
163
164 let manual_topics = vec![
165 LdaTopic {
166 id: 0,
167 top_words: vec![
168 ("learning".to_string(), 0.15),
169 ("machine".to_string(), 0.12),
170 ("algorithm".to_string(), 0.10),
171 ("data".to_string(), 0.08),
172 ("model".to_string(), 0.07),
173 ],
174 coherence: None,
175 },
176 LdaTopic {
177 id: 1,
178 top_words: vec![
179 ("network".to_string(), 0.14),
180 ("neural".to_string(), 0.13),
181 ("deep".to_string(), 0.11),
182 ("layer".to_string(), 0.09),
183 ("process".to_string(), 0.08),
184 ],
185 coherence: None,
186 },
187 ];
188
189 let manual_coherence = coherence_calc.cv_coherence(&manual_topics, &tokenized_docs)?;
190 let manual_diversity = TopicDiversity::calculate(&manual_topics);
191
192 println!("Manual topics coherence: {manual_coherence:.4}");
193 println!("Manual topics diversity: {manual_diversity:.4}");
194
195 Ok(())
196}
Sourcepub fn uci_coherence(
&self,
topics: &[Topic],
documents: &[Vec<String>],
) -> Result<f64>
pub fn uci_coherence( &self, topics: &[Topic], documents: &[Vec<String>], ) -> Result<f64>
Calculate UCI coherence
Examples found in repository?
examples/topic_coherence_demo.rs (line 105)
10fn main() -> Result<(), Box<dyn std::error::Error>> {
11 println!("Topic Coherence Evaluation Demo");
12 println!("==============================\n");
13
14 // Sample documents for topic modeling
15 let documents = vec![
16 "Machine learning algorithms are used in artificial intelligence",
17 "Deep learning neural networks process complex data patterns",
18 "Natural language processing enables text understanding",
19 "Computer vision algorithms detect objects in images",
20 "Reinforcement learning agents learn through trial and error",
21 "Supervised learning requires labeled training data",
22 "Unsupervised learning discovers hidden patterns",
23 "Transfer learning reuses pretrained models",
24 "Statistical models analyze numerical data distributions",
25 "Regression analysis predicts continuous outcomes",
26 "Classification algorithms categorize data points",
27 "Time series analysis forecasts temporal patterns",
28 "Clustering groups similar data together",
29 "Feature engineering improves model performance",
30 "Model validation prevents overfitting",
31 ];
32
33 // Tokenize documents
34 let tokenizer = WhitespaceTokenizer::new();
35 let tokenized_docs: Vec<Vec<String>> = documents
36 .iter()
37 .map(|doc| tokenizer.tokenize(doc).expect("Operation failed"))
38 .collect();
39
40 // Create a simple vocabulary for demonstration
41 let mut vocabulary = HashMap::new();
42 let mut word_id = 0;
43
44 for doc in &tokenized_docs {
45 for word in doc {
46 if !vocabulary.contains_key(word) {
47 vocabulary.insert(word.clone(), word_id);
48 word_id += 1;
49 }
50 }
51 }
52
53 // Create document-term matrix
54 let n_docs = tokenized_docs.len();
55 let n_words = vocabulary.len();
56 let mut doc_term_matrix = scirs2_core::ndarray::Array2::zeros((n_docs, n_words));
57
58 for (doc_idx, doc) in tokenized_docs.iter().enumerate() {
59 for word in doc {
60 if let Some(&word_id) = vocabulary.get(word) {
61 doc_term_matrix[[doc_idx, word_id]] += 1.0;
62 }
63 }
64 }
65
66 // Train LDA model
67 println!("1. Training LDA Model");
68 println!("--------------------");
69
70 let mut lda = LatentDirichletAllocation::with_ntopics(3);
71 lda.fit(&doc_term_matrix)?;
72
73 // Create reverse vocabulary mapping
74 let id_to_word: HashMap<usize, String> = vocabulary
75 .iter()
76 .map(|(word, &id)| (id, word.clone()))
77 .collect();
78
79 // Get topics
80 let topics = lda.get_topics(5, &id_to_word)?;
81
82 println!("Discovered topics:");
83 for (i, topic) in topics.iter().enumerate() {
84 println!("\nTopic {}: ", i + 1);
85 for (word, prob) in &topic.top_words {
86 println!(" {word} ({prob:.4})");
87 }
88 }
89
90 // Calculate coherence metrics
91 println!("\n2. Topic Coherence Metrics");
92 println!("-------------------------");
93
94 let coherence_calc = TopicCoherence::new().with_window_size(5);
95
96 // C_v coherence
97 let cv_coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
98 println!("C_v coherence: {cv_coherence:.4}");
99
100 // UMass coherence
101 let umass_coherence = coherence_calc.umass_coherence(&topics, &tokenized_docs)?;
102 println!("UMass coherence: {umass_coherence:.4}");
103
104 // UCI coherence
105 let uci_coherence = coherence_calc.uci_coherence(&topics, &tokenized_docs)?;
106 println!("UCI coherence: {uci_coherence:.4}");
107
108 // Topic diversity
109 println!("\n3. Topic Diversity");
110 println!("-----------------");
111
112 let diversity = TopicDiversity::calculate(&topics);
113 println!("Topic diversity: {diversity:.4}");
114
115 // Pairwise distances
116 let distances = TopicDiversity::pairwise_distances(&topics);
117 println!("\nPairwise Jaccard distances between topics:");
118 for i in 0..distances.nrows() {
119 for j in 0..distances.ncols() {
120 print!("{:.3} ", distances[[i, j]]);
121 }
122 println!();
123 }
124
125 // Compare different numbers of topics
126 println!("\n4. Optimal Topic Number Analysis");
127 println!("-------------------------------");
128
129 let topic_counts = vec![2, 3, 4, 5];
130 let mut results = Vec::new();
131
132 for n_topics in topic_counts {
133 let mut lda = LatentDirichletAllocation::with_ntopics(n_topics);
134 lda.fit(&doc_term_matrix)?;
135
136 let topics = lda.get_topics(5, &id_to_word)?;
137 let coherence = coherence_calc.cv_coherence(&topics, &tokenized_docs)?;
138 let diversity = TopicDiversity::calculate(&topics);
139
140 results.push((n_topics, coherence, diversity));
141 println!("{n_topics} topics: coherence={coherence:.4}, diversity={diversity:.4}");
142 }
143
144 // Find optimal number of topics
145 let optimal = results
146 .iter()
147 .max_by(|a, b| {
148 // Balance coherence and diversity
149 let score_a = a.1 + 0.5 * a.2;
150 let score_b = b.1 + 0.5 * b.2;
151 score_a.partial_cmp(&score_b).expect("Operation failed")
152 })
153 .expect("Operation failed");
154
155 println!(
156 "\nOptimal number of topics: {} (coherence={:.4}, diversity={:.4})",
157 optimal.0, optimal.1, optimal.2
158 );
159
160 // Manual topic example
161 println!("\n5. Manual Topic Evaluation");
162 println!("-------------------------");
163
164 let manual_topics = vec![
165 LdaTopic {
166 id: 0,
167 top_words: vec![
168 ("learning".to_string(), 0.15),
169 ("machine".to_string(), 0.12),
170 ("algorithm".to_string(), 0.10),
171 ("data".to_string(), 0.08),
172 ("model".to_string(), 0.07),
173 ],
174 coherence: None,
175 },
176 LdaTopic {
177 id: 1,
178 top_words: vec![
179 ("network".to_string(), 0.14),
180 ("neural".to_string(), 0.13),
181 ("deep".to_string(), 0.11),
182 ("layer".to_string(), 0.09),
183 ("process".to_string(), 0.08),
184 ],
185 coherence: None,
186 },
187 ];
188
189 let manual_coherence = coherence_calc.cv_coherence(&manual_topics, &tokenized_docs)?;
190 let manual_diversity = TopicDiversity::calculate(&manual_topics);
191
192 println!("Manual topics coherence: {manual_coherence:.4}");
193 println!("Manual topics diversity: {manual_diversity:.4}");
194
195 Ok(())
196}
Trait Implementations§
Auto Trait Implementations§
impl Freeze for TopicCoherence
impl RefUnwindSafe for TopicCoherence
impl Send for TopicCoherence
impl Sync for TopicCoherence
impl Unpin for TopicCoherence
impl UnwindSafe for TopicCoherence
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts
self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts
self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§impl<T> Pointable for T
impl<T> Pointable for T
Source§impl<SS, SP> SupersetOf<SS> for SP where
SS: SubsetOf<SP>,
impl<SS, SP> SupersetOf<SS> for SP where
SS: SubsetOf<SP>,
Source§fn to_subset(&self) -> Option<SS>
fn to_subset(&self) -> Option<SS>
The inverse inclusion map: attempts to construct
self from the equivalent element of its
superset. Read more
Source§fn is_in_subset(&self) -> bool
fn is_in_subset(&self) -> bool
Checks if
self is actually part of its subset T (and can be converted to it).
Source§fn to_subset_unchecked(&self) -> SS
fn to_subset_unchecked(&self) -> SS
Use with care! Same as
self.to_subset but without any property checks. Always succeeds.
Source§fn from_subset(element: &SS) -> SP
fn from_subset(element: &SS) -> SP
The inclusion map: converts
self to the equivalent element of its superset.