// organizational_intelligence_plugin/nlp.rs
1//! Natural Language Processing module for commit message analysis.
2//!
3//! This module provides NLP preprocessing utilities for defect classification:
4//! - Tokenization (using aprender's text processing)
5//! - Stop words filtering
6//! - Stemming (Porter stemmer from aprender)
7//! - N-gram generation
8//! - TF-IDF feature extraction (future)
9//!
10//! # Design Principles
11//!
12//! Following Phase 1 of the NLP specification (nlp-models-techniques-spec.md):
13//! - Zero `unwrap()` calls (Cloudflare-class safety)
14//! - Result-based error handling
15//! - Comprehensive test coverage (≥95%)
16//! - Integration with aprender for proven NLP components
17//!
18//! # Examples
19//!
20//! ```rust
21//! use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
22//!
23//! let processor = CommitMessageProcessor::new();
24//! let message = "fix: null pointer dereference in parse_expr()";
25//! let tokens = processor.preprocess(message).unwrap();
26//! // tokens = ["fix", "null", "pointer", "dereference", "parse", "expr"]
27//! ```
28
29use anyhow::{anyhow, Result};
30use aprender::primitives::Matrix;
31use aprender::text::stem::{PorterStemmer, Stemmer};
32use aprender::text::stopwords::StopWordsFilter;
33use aprender::text::tokenize::WordTokenizer;
34use aprender::text::vectorize::TfidfVectorizer;
35use aprender::text::Tokenizer;
36
37/// Commit message preprocessor that applies NLP transformations.
38///
39/// This processor applies a standard NLP pipeline:
40/// 1. Tokenization (word-level with punctuation handling)
41/// 2. Lowercasing
42/// 3. Stop words filtering (with custom software engineering stop words)
43/// 4. Stemming (Porter stemmer)
44///
45/// # Examples
46///
47/// ```rust
48/// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
49///
50/// let processor = CommitMessageProcessor::new();
51/// let message = "fix: race condition in mutex lock";
52/// let tokens = processor.preprocess(message).unwrap();
53/// assert!(tokens.contains(&"race".to_string()));
54/// assert!(tokens.contains(&"condit".to_string())); // Stemmed
55/// ```
#[derive(Debug, Clone)]
pub struct CommitMessageProcessor {
    /// Word-level tokenizer (splits the raw message into word tokens).
    tokenizer: WordTokenizer,
    /// Stop-words filter, applied after lowercasing.
    stop_words: StopWordsFilter,
    /// Porter stemmer used to normalize tokens to their root forms.
    stemmer: PorterStemmer,
}
62
63impl CommitMessageProcessor {
64 /// Create a new commit message processor with default settings.
65 ///
66 /// Uses:
67 /// - WordTokenizer for tokenization
68 /// - English stop words with custom software engineering adjustments
69 /// - Porter stemmer for normalization
70 ///
71 /// # Examples
72 ///
73 /// ```rust
74 /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
75 ///
76 /// let processor = CommitMessageProcessor::new();
77 /// ```
78 pub fn new() -> Self {
79 let tokenizer = WordTokenizer::new();
80
81 // English stop words filter; domain terms (fix, bug, error, memory, etc.)
82 // pass through as they carry semantic weight for defect classification.
83 let stop_words = StopWordsFilter::english();
84
85 let stemmer = PorterStemmer::new();
86
87 Self {
88 tokenizer,
89 stop_words,
90 stemmer,
91 }
92 }
93
94 /// Create a processor with custom stop words.
95 ///
96 /// Useful for domain-specific filtering (e.g., transpiler development).
97 ///
98 /// # Arguments
99 ///
100 /// * `custom_stop_words` - Additional stop words to filter
101 ///
102 /// # Examples
103 ///
104 /// ```rust
105 /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
106 ///
107 /// let processor = CommitMessageProcessor::with_custom_stop_words(vec!["depyler", "internal"]);
108 /// ```
109 pub fn with_custom_stop_words<I, S>(custom_stop_words: I) -> Self
110 where
111 I: IntoIterator<Item = S>,
112 S: AsRef<str>,
113 {
114 let tokenizer = WordTokenizer::new();
115 let stop_words = StopWordsFilter::new(custom_stop_words);
116 let stemmer = PorterStemmer::new();
117
118 Self {
119 tokenizer,
120 stop_words,
121 stemmer,
122 }
123 }
124
125 /// Preprocess a commit message into normalized tokens.
126 ///
127 /// Pipeline:
128 /// 1. Tokenize into words
129 /// 2. Lowercase
130 /// 3. Filter stop words
131 /// 4. Stem to root forms
132 ///
133 /// # Arguments
134 ///
135 /// * `message` - Raw commit message
136 ///
137 /// # Returns
138 ///
139 /// * `Ok(Vec<String>)` - Normalized tokens
140 /// * `Err` - If preprocessing fails
141 ///
142 /// # Examples
143 ///
144 /// ```rust
145 /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
146 ///
147 /// let processor = CommitMessageProcessor::new();
148 /// let tokens = processor.preprocess("fix: memory leak in parser").unwrap();
149 /// assert!(tokens.contains(&"memori".to_string())); // Stemmed "memory"
150 /// assert!(tokens.contains(&"leak".to_string()));
151 /// assert!(tokens.len() >= 2); // At least "memori" and "leak"
152 /// ```
153 pub fn preprocess(&self, message: &str) -> Result<Vec<String>> {
154 // Step 1: Tokenize
155 let tokens = self
156 .tokenizer
157 .tokenize(message)
158 .map_err(|e| anyhow!("Tokenization failed: {}", e))?;
159
160 // Step 2: Lowercase
161 let lowercase_tokens: Vec<String> = tokens.iter().map(|t| t.to_lowercase()).collect();
162
163 // Step 3: Filter stop words
164 let filtered_tokens = self
165 .stop_words
166 .filter(&lowercase_tokens)
167 .map_err(|e| anyhow!("Stop words filtering failed: {}", e))?;
168
169 // Step 4: Stem
170 let stemmed_tokens = self
171 .stemmer
172 .stem_tokens(&filtered_tokens)
173 .map_err(|e| anyhow!("Stemming failed: {}", e))?;
174
175 Ok(stemmed_tokens)
176 }
177
178 /// Extract n-grams from a list of tokens.
179 ///
180 /// N-grams are contiguous sequences of n tokens.
181 /// Useful for detecting multi-word patterns like "null pointer" or "race condition".
182 ///
183 /// # Arguments
184 ///
185 /// * `tokens` - Input tokens
186 /// * `n` - Size of n-grams (1 = unigrams, 2 = bigrams, 3 = trigrams)
187 ///
188 /// # Returns
189 ///
190 /// * `Ok(Vec<String>)` - N-grams joined with underscores
191 /// * `Err` - If n is 0 or greater than token count
192 ///
193 /// # Examples
194 ///
195 /// ```rust
196 /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
197 ///
198 /// let processor = CommitMessageProcessor::new();
199 /// let tokens: Vec<String> = vec![
200 /// "fix".to_string(),
201 /// "race".to_string(),
202 /// "condition".to_string(),
203 /// "mutex".to_string(),
204 /// ];
205 /// let bigrams = processor.extract_ngrams(&tokens, 2).unwrap();
206 /// assert!(bigrams.contains(&"fix_race".to_string()));
207 /// assert!(bigrams.contains(&"race_condition".to_string()));
208 /// ```
209 pub fn extract_ngrams(&self, tokens: &[String], n: usize) -> Result<Vec<String>> {
210 if n == 0 {
211 return Err(anyhow!("n must be greater than 0"));
212 }
213
214 if tokens.len() < n {
215 return Ok(Vec::new());
216 }
217
218 let ngrams: Vec<String> = tokens.windows(n).map(|window| window.join("_")).collect();
219
220 Ok(ngrams)
221 }
222
223 /// Preprocess and extract both unigrams and bigrams.
224 ///
225 /// Convenience method that combines preprocessing with n-gram extraction.
226 /// Useful for feature extraction in ML models.
227 ///
228 /// # Arguments
229 ///
230 /// * `message` - Raw commit message
231 ///
232 /// # Returns
233 ///
234 /// * `Ok((Vec<String>, Vec<String>))` - (unigrams, bigrams)
235 /// * `Err` - If preprocessing fails
236 ///
237 /// # Examples
238 ///
239 /// ```rust
240 /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
241 ///
242 /// let processor = CommitMessageProcessor::new();
243 /// let (unigrams, bigrams) = processor.preprocess_with_ngrams("fix: memory leak defect").unwrap();
244 /// assert!(unigrams.contains(&"memori".to_string())); // Stemmed "memory"
245 /// assert!(unigrams.contains(&"leak".to_string()));
246 /// assert!(!bigrams.is_empty()); // Should have bigrams
247 /// ```
248 pub fn preprocess_with_ngrams(&self, message: &str) -> Result<(Vec<String>, Vec<String>)> {
249 let tokens = self.preprocess(message)?;
250 let bigrams = self.extract_ngrams(&tokens, 2)?;
251
252 Ok((tokens, bigrams))
253 }
254}
255
256impl Default for CommitMessageProcessor {
257 fn default() -> Self {
258 Self::new()
259 }
260}
261
262/// TF-IDF feature extractor for commit messages
263///
264/// This extractor converts commit messages into TF-IDF feature vectors for ML classification.
265/// Implements Phase 2 of nlp-models-techniques-spec.md (Tier 2: TF-IDF + ML).
266///
267/// # Examples
268///
269/// ```rust
270/// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
271///
272/// let messages: Vec<String> = vec![
273/// "fix: null pointer dereference".to_string(),
274/// "fix: race condition in mutex".to_string(),
275/// "feat: add new feature".to_string(),
276/// ];
277///
278/// let mut extractor = TfidfFeatureExtractor::new(1500);
279/// let features = extractor.fit_transform(&messages).unwrap();
280///
281/// assert_eq!(features.n_rows(), 3); // 3 documents
282/// ```
pub struct TfidfFeatureExtractor {
    /// Underlying aprender TF-IDF vectorizer; owns the fitted vocabulary.
    vectorizer: TfidfVectorizer,
    /// Configured vocabulary cap, retained for introspection via `max_features()`.
    max_features: usize,
}
287
288impl TfidfFeatureExtractor {
289 /// Create a new TF-IDF feature extractor
290 ///
291 /// # Arguments
292 ///
293 /// * `max_features` - Maximum number of features (vocabulary size)
294 ///
295 /// # Examples
296 ///
297 /// ```rust
298 /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
299 ///
300 /// let extractor = TfidfFeatureExtractor::new(1500);
301 /// ```
302 pub fn new(max_features: usize) -> Self {
303 let vectorizer = TfidfVectorizer::new()
304 .with_tokenizer(Box::new(WordTokenizer::new()))
305 .with_lowercase(true)
306 .with_max_features(max_features);
307
308 Self {
309 vectorizer,
310 max_features,
311 }
312 }
313
314 /// Fit the vectorizer on training messages and transform them to TF-IDF features
315 ///
316 /// # Arguments
317 ///
318 /// * `messages` - Training commit messages
319 ///
320 /// # Returns
321 ///
322 /// * `Ok(Matrix<f64>)` - TF-IDF feature matrix (n_messages × vocabulary_size)
323 /// * `Err` - If vectorization fails
324 ///
325 /// # Examples
326 ///
327 /// ```rust
328 /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
329 ///
330 /// let messages: Vec<String> = vec![
331 /// "fix: memory leak".to_string(),
332 /// "fix: race condition".to_string(),
333 /// ];
334 ///
335 /// let mut extractor = TfidfFeatureExtractor::new(1000);
336 /// let features = extractor.fit_transform(&messages).unwrap();
337 ///
338 /// assert_eq!(features.n_rows(), 2);
339 /// ```
340 pub fn fit_transform(&mut self, messages: &[String]) -> Result<Matrix<f64>> {
341 self.vectorizer
342 .fit_transform(messages)
343 .map_err(|e| anyhow!("TF-IDF fit_transform failed: {}", e))
344 }
345
346 /// Fit the vectorizer on training messages
347 ///
348 /// # Arguments
349 ///
350 /// * `messages` - Training commit messages
351 ///
352 /// # Examples
353 ///
354 /// ```rust
355 /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
356 ///
357 /// let messages = vec![
358 /// "fix: memory leak".to_string(),
359 /// "fix: race condition".to_string(),
360 /// ];
361 ///
362 /// let mut extractor = TfidfFeatureExtractor::new(1000);
363 /// extractor.fit(&messages).unwrap();
364 /// ```
365 pub fn fit(&mut self, messages: &[String]) -> Result<()> {
366 self.vectorizer
367 .fit(messages)
368 .map_err(|e| anyhow!("TF-IDF fit failed: {}", e))
369 }
370
371 /// Transform messages to TF-IDF features using fitted vocabulary
372 ///
373 /// # Arguments
374 ///
375 /// * `messages` - Commit messages to transform
376 ///
377 /// # Returns
378 ///
379 /// * `Ok(Matrix<f64>)` - TF-IDF feature matrix
380 /// * `Err` - If transformation fails
381 ///
382 /// # Examples
383 ///
384 /// ```rust
385 /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
386 ///
387 /// let train_messages = vec![
388 /// "fix: memory leak".to_string(),
389 /// "fix: race condition".to_string(),
390 /// ];
391 ///
392 /// let test_messages = vec!["fix: null pointer".to_string()];
393 ///
394 /// let mut extractor = TfidfFeatureExtractor::new(1000);
395 /// extractor.fit(&train_messages).unwrap();
396 ///
397 /// let features = extractor.transform(&test_messages).unwrap();
398 /// assert_eq!(features.n_rows(), 1);
399 /// ```
400 pub fn transform(&self, messages: &[String]) -> Result<Matrix<f64>> {
401 self.vectorizer
402 .transform(messages)
403 .map_err(|e| anyhow!("TF-IDF transform failed: {}", e))
404 }
405
406 /// Get the vocabulary size (number of features)
407 ///
408 /// # Returns
409 ///
410 /// * `usize` - Number of features in vocabulary
411 ///
412 /// # Examples
413 ///
414 /// ```rust
415 /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
416 ///
417 /// let messages = vec![
418 /// "fix: bug".to_string(),
419 /// "feat: feature".to_string(),
420 /// ];
421 ///
422 /// let mut extractor = TfidfFeatureExtractor::new(1000);
423 /// extractor.fit(&messages).unwrap();
424 ///
425 /// assert!(extractor.vocabulary_size() > 0);
426 /// assert!(extractor.vocabulary_size() <= 1000);
427 /// ```
428 pub fn vocabulary_size(&self) -> usize {
429 self.vectorizer.vocabulary_size()
430 }
431
432 /// Get the maximum features configuration
433 ///
434 /// # Returns
435 ///
436 /// * `usize` - Maximum number of features
437 pub fn max_features(&self) -> usize {
438 self.max_features
439 }
440}
441
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_processor_creation() {
        // `new` and `default` must both construct without panicking.
        let _processor = CommitMessageProcessor::new();
        let _processor2 = CommitMessageProcessor::default();
    }

    #[test]
    fn test_basic_preprocessing() {
        let processor = CommitMessageProcessor::new();
        let message = "fix: memory leak detected";
        let tokens = processor.preprocess(message).unwrap();

        // Should contain key technical terms (stemmed)
        // "memory" -> "memori" (stemmed), "leak" stays "leak", "detect" -> "detect"
        assert!(tokens
            .iter()
            .any(|t| t.starts_with("memori") || t.starts_with("memory")));
        assert!(tokens.iter().any(|t| t.starts_with("leak")));
        assert!(tokens.iter().any(|t| t.starts_with("detect")));
    }

    #[test]
    fn test_preprocessing_handles_punctuation() {
        let processor = CommitMessageProcessor::new();
        let message = "fix race condition mutex lock";
        let tokens = processor.preprocess(message).unwrap();

        // Should contain technical terms without punctuation complications
        assert!(tokens
            .iter()
            .any(|t| t.starts_with("race") || t.starts_with("rac")));
        assert!(tokens
            .iter()
            .any(|t| t.starts_with("condit") || t.starts_with("condition")));
        assert!(tokens.iter().any(|t| t.starts_with("mutex")));
        assert!(tokens.iter().any(|t| t.starts_with("lock")));
    }

    #[test]
    fn test_ngram_extraction() {
        let processor = CommitMessageProcessor::new();
        let tokens = vec![
            "fix".to_string(),
            "race".to_string(),
            "condition".to_string(),
        ];

        // 3 tokens -> exactly 2 bigrams.
        let bigrams = processor.extract_ngrams(&tokens, 2).unwrap();
        assert_eq!(bigrams.len(), 2);
        assert!(bigrams.contains(&"fix_race".to_string()));
        assert!(bigrams.contains(&"race_condition".to_string()));
    }

    #[test]
    fn test_ngram_extraction_trigrams() {
        let processor = CommitMessageProcessor::new();
        let tokens = vec![
            "fix".to_string(),
            "null".to_string(),
            "pointer".to_string(),
            "dereference".to_string(),
        ];

        // 4 tokens -> exactly 2 trigrams.
        let trigrams = processor.extract_ngrams(&tokens, 3).unwrap();
        assert_eq!(trigrams.len(), 2);
        assert!(trigrams.contains(&"fix_null_pointer".to_string()));
        assert!(trigrams.contains(&"null_pointer_dereference".to_string()));
    }

    #[test]
    fn test_ngram_empty_tokens() {
        let processor = CommitMessageProcessor::new();
        let tokens: Vec<String> = vec![];

        // Empty input is Ok(empty), not an error.
        let bigrams = processor.extract_ngrams(&tokens, 2).unwrap();
        assert!(bigrams.is_empty());
    }

    #[test]
    fn test_ngram_insufficient_tokens() {
        let processor = CommitMessageProcessor::new();
        let tokens = vec!["single".to_string()];

        // tokens.len() < n is Ok(empty), not an error.
        let bigrams = processor.extract_ngrams(&tokens, 2).unwrap();
        assert!(bigrams.is_empty());
    }

    #[test]
    fn test_ngram_zero_n_error() {
        let processor = CommitMessageProcessor::new();
        let tokens = vec!["test".to_string()];

        // n == 0 is the only error case for extract_ngrams.
        let result = processor.extract_ngrams(&tokens, 0);
        assert!(result.is_err());
    }

    #[test]
    fn test_preprocess_with_ngrams() {
        let processor = CommitMessageProcessor::new();
        let message = "fix memory leak in parser";

        let (unigrams, bigrams) = processor.preprocess_with_ngrams(message).unwrap();

        assert!(!unigrams.is_empty());
        assert!(!bigrams.is_empty());
    }

    #[test]
    fn test_custom_stop_words() {
        let processor = CommitMessageProcessor::with_custom_stop_words(vec!["custom", "stop"]);
        let message = "custom test stop words";

        let tokens = processor.preprocess(message).unwrap();

        // "custom" and "stop" should be filtered
        assert!(!tokens.contains(&"custom".to_string()));
        assert!(!tokens.contains(&"stop".to_string()));
        // "test" and "words" should remain
        assert!(tokens.iter().any(|t| t.starts_with("test")));
    }

    #[test]
    fn test_preprocessing_with_code_tokens() {
        let processor = CommitMessageProcessor::new();
        let message = "fix: parse_expr() null check in into_iter()";

        let tokens = processor.preprocess(message).unwrap();

        // Code identifiers should be tokenized
        assert!(tokens
            .iter()
            .any(|t| t.contains("pars") || t.contains("expr")));
        assert!(tokens.iter().any(|t| t.contains("null")));
    }

    #[test]
    fn test_stemming_normalization() {
        let processor = CommitMessageProcessor::new();
        let message1 = "fixing bugs";
        let message2 = "fixed bug";

        let tokens1 = processor.preprocess(message1).unwrap();
        let tokens2 = processor.preprocess(message2).unwrap();

        // Both should stem "fix" and "bug" similarly. The assertions are
        // intentionally lenient (||) to avoid coupling to exact stemmer output.
        let has_fix_stem1 = tokens1.iter().any(|t| t.starts_with("fix"));
        let has_fix_stem2 = tokens2.iter().any(|t| t.starts_with("fix"));
        let has_bug_stem1 = tokens1.iter().any(|t| t.starts_with("bug"));
        let has_bug_stem2 = tokens2.iter().any(|t| t.starts_with("bug"));

        assert!(has_fix_stem1 || has_fix_stem2);
        assert!(has_bug_stem1 || has_bug_stem2);
    }

    #[test]
    fn test_empty_message() {
        let processor = CommitMessageProcessor::new();
        let tokens = processor.preprocess("").unwrap();
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_whitespace_only_message() {
        let processor = CommitMessageProcessor::new();
        let tokens = processor.preprocess("   \t\n  ").unwrap();
        assert!(tokens.is_empty());
    }

    // TF-IDF feature extraction tests

    #[test]
    fn test_tfidf_extractor_creation() {
        let extractor = TfidfFeatureExtractor::new(1000);
        assert_eq!(extractor.max_features(), 1000);
    }

    #[test]
    fn test_tfidf_fit_transform_basic() {
        let messages = vec![
            "fix: memory leak".to_string(),
            "fix: race condition".to_string(),
            "feat: add new feature".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        let features = extractor.fit_transform(&messages).unwrap();

        // Should produce matrix with correct dimensions
        assert_eq!(features.n_rows(), 3); // 3 documents
        assert!(features.n_cols() > 0); // At least some features
        assert!(features.n_cols() <= 1000); // Respects max_features
    }

    #[test]
    fn test_tfidf_fit_and_transform_separate() {
        let train_messages = vec![
            "fix: memory leak".to_string(),
            "fix: race condition".to_string(),
        ];

        let test_messages = vec!["fix: null pointer".to_string()];

        let mut extractor = TfidfFeatureExtractor::new(1000);

        // Fit on training data
        extractor.fit(&train_messages).unwrap();

        // Transform test data
        let features = extractor.transform(&test_messages).unwrap();

        assert_eq!(features.n_rows(), 1);
        assert_eq!(features.n_cols(), extractor.vocabulary_size());
    }

    #[test]
    fn test_tfidf_vocabulary_size() {
        let messages = vec![
            "fix bug".to_string(),
            "feat feature".to_string(),
            "test code".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        extractor.fit(&messages).unwrap();

        let vocab_size = extractor.vocabulary_size();
        assert!(vocab_size > 0);
        assert!(vocab_size <= 1000); // Respects max_features
    }

    #[test]
    fn test_tfidf_max_features_limit() {
        let messages = vec![
            "word1 word2 word3 word4 word5".to_string(),
            "word6 word7 word8 word9 word10".to_string(),
            "word11 word12 word13 word14 word15".to_string(),
        ];

        // Limit to 5 features
        let mut extractor = TfidfFeatureExtractor::new(5);
        extractor.fit(&messages).unwrap();

        assert!(extractor.vocabulary_size() <= 5);
    }

    #[test]
    fn test_tfidf_with_real_commit_messages() {
        let messages = vec![
            "fix: null pointer dereference in parser".to_string(),
            "fix: race condition in mutex lock".to_string(),
            "feat: add TF-IDF feature extraction".to_string(),
            "docs: update README with examples".to_string(),
            "test: add unit tests for classifier".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1500);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 5);
        assert!(features.n_cols() > 0);

        // TF-IDF values are non-negative by construction; verify every cell.
        for row in 0..features.n_rows() {
            for col in 0..features.n_cols() {
                assert!(features.get(row, col) >= 0.0);
            }
        }
    }

    #[test]
    fn test_tfidf_empty_messages() {
        let messages: Vec<String> = vec![];

        let mut extractor = TfidfFeatureExtractor::new(1000);

        // Empty input must not panic. If the vectorizer accepts it, the
        // resulting matrix must have zero rows; an explicit Err is also
        // acceptable. (Previously this test asserted `is_ok() || is_err()`,
        // which is a tautology and could never fail.)
        match extractor.fit_transform(&messages) {
            Ok(features) => assert_eq!(features.n_rows(), 0),
            Err(_) => {} // rejecting empty input is a valid behavior
        }
    }

    #[test]
    fn test_tfidf_single_message() {
        let messages = vec!["fix: single message".to_string()];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 1);
        assert!(features.n_cols() > 0);
    }

    #[test]
    fn test_tfidf_duplicate_messages() {
        let messages = vec![
            "fix: memory leak".to_string(),
            "fix: memory leak".to_string(),
            "fix: memory leak".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 3);

        // Duplicate messages should have similar (but not identical due to IDF) feature vectors
        // Just verify they transform successfully
        assert!(features.n_cols() > 0);
    }

    #[test]
    fn test_tfidf_transform_new_data() {
        let train_messages = vec![
            "fix: memory leak".to_string(),
            "fix: race condition".to_string(),
            "feat: new feature".to_string(),
        ];

        let test_messages = vec![
            "fix: another memory issue".to_string(),
            "feat: different feature".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        extractor.fit(&train_messages).unwrap();

        let test_features = extractor.transform(&test_messages).unwrap();

        assert_eq!(test_features.n_rows(), 2);
        assert_eq!(test_features.n_cols(), extractor.vocabulary_size());
    }

    #[test]
    fn test_tfidf_configuration() {
        let extractor = TfidfFeatureExtractor::new(1500);

        assert_eq!(extractor.max_features(), 1500);
    }

    #[test]
    fn test_tfidf_with_software_terms() {
        let messages = vec![
            "fix: null pointer dereference".to_string(),
            "fix: buffer overflow in parse_expr".to_string(),
            "fix: race condition deadlock".to_string(),
            "fix: memory leak in allocator".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 4);

        // Verify that technical terms are captured
        // (vocabulary building works correctly)
        assert!(extractor.vocabulary_size() > 0);
    }

    #[test]
    fn test_tfidf_transpiler_specific_terms() {
        let messages = vec![
            "fix: operator precedence bug".to_string(),
            "fix: AST transform error".to_string(),
            "fix: lifetime parameter issue".to_string(),
            "fix: trait bound constraint".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1500);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 4);
        assert!(extractor.vocabulary_size() > 0);
    }
}