organizational_intelligence_plugin/nlp.rs

//! Natural Language Processing module for commit message analysis.
//!
//! This module provides NLP preprocessing utilities for defect classification:
//! - Tokenization (using aprender's text processing)
//! - Stop words filtering
//! - Stemming (Porter stemmer from aprender)
//! - N-gram generation
//! - TF-IDF feature extraction
//!
//! # Design Principles
//!
//! Following Phase 1 of the NLP specification (nlp-models-techniques-spec.md):
//! - Zero `unwrap()` calls (Cloudflare-class safety)
//! - Result-based error handling
//! - Comprehensive test coverage (≥95%)
//! - Integration with aprender for proven NLP components
//!
//! # Examples
//!
//! ```rust
//! use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
//!
//! let processor = CommitMessageProcessor::new();
//! let message = "fix: null pointer dereference in parse_expr()";
//! let tokens = processor.preprocess(message).unwrap();
//! // tokens = ["fix", "null", "pointer", "dereference", "parse", "expr"]
//! ```
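//!
//! For Tier 2 feature extraction, [`TfidfFeatureExtractor`] converts a corpus of
//! commit messages into a TF-IDF matrix (a minimal sketch using the API below):
//!
//! ```rust
//! use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
//!
//! let messages = vec![
//!     "fix: null pointer dereference".to_string(),
//!     "feat: add new feature".to_string(),
//! ];
//!
//! let mut extractor = TfidfFeatureExtractor::new(1500);
//! let features = extractor.fit_transform(&messages).unwrap();
//! assert_eq!(features.n_rows(), 2); // one row per message
//! ```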

use anyhow::{anyhow, Result};
use aprender::primitives::Matrix;
use aprender::text::stem::{PorterStemmer, Stemmer};
use aprender::text::stopwords::StopWordsFilter;
use aprender::text::tokenize::WordTokenizer;
use aprender::text::vectorize::TfidfVectorizer;
use aprender::text::Tokenizer;

/// Commit message preprocessor that applies NLP transformations.
///
/// This processor applies a standard NLP pipeline:
/// 1. Tokenization (word-level with punctuation handling)
/// 2. Lowercasing
/// 3. Stop words filtering (English stop words; domain terms pass through)
/// 4. Stemming (Porter stemmer)
///
/// # Examples
///
/// ```rust
/// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
///
/// let processor = CommitMessageProcessor::new();
/// let message = "fix: race condition in mutex lock";
/// let tokens = processor.preprocess(message).unwrap();
/// assert!(tokens.contains(&"race".to_string()));
/// assert!(tokens.contains(&"condit".to_string())); // Stemmed
/// ```
#[derive(Debug, Clone)]
pub struct CommitMessageProcessor {
    tokenizer: WordTokenizer,
    stop_words: StopWordsFilter,
    stemmer: PorterStemmer,
}

impl CommitMessageProcessor {
    /// Create a new commit message processor with default settings.
    ///
    /// Uses:
    /// - WordTokenizer for tokenization
    /// - English stop words (domain terms like "fix" and "bug" pass through)
    /// - Porter stemmer for normalization
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
    ///
    /// let processor = CommitMessageProcessor::new();
    /// ```
    pub fn new() -> Self {
        let tokenizer = WordTokenizer::new();

        // English stop words filter; domain terms (fix, bug, error, memory, etc.)
        // pass through as they carry semantic weight for defect classification.
        let stop_words = StopWordsFilter::english();

        let stemmer = PorterStemmer::new();

        Self {
            tokenizer,
            stop_words,
            stemmer,
        }
    }

    /// Create a processor with custom stop words.
    ///
    /// Useful for domain-specific filtering (e.g., transpiler development).
    ///
    /// # Arguments
    ///
    /// * `custom_stop_words` - Stop words to filter (used instead of the default English set)
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
    ///
    /// let processor = CommitMessageProcessor::with_custom_stop_words(vec!["depyler", "internal"]);
    /// ```
    pub fn with_custom_stop_words<I, S>(custom_stop_words: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        let tokenizer = WordTokenizer::new();
        let stop_words = StopWordsFilter::new(custom_stop_words);
        let stemmer = PorterStemmer::new();

        Self {
            tokenizer,
            stop_words,
            stemmer,
        }
    }

    /// Preprocess a commit message into normalized tokens.
    ///
    /// Pipeline:
    /// 1. Tokenize into words
    /// 2. Lowercase
    /// 3. Filter stop words
    /// 4. Stem to root forms
    ///
    /// # Arguments
    ///
    /// * `message` - Raw commit message
    ///
    /// # Returns
    ///
    /// * `Ok(Vec<String>)` - Normalized tokens
    /// * `Err` - If preprocessing fails
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
    ///
    /// let processor = CommitMessageProcessor::new();
    /// let tokens = processor.preprocess("fix: memory leak in parser").unwrap();
    /// assert!(tokens.contains(&"memori".to_string())); // Stemmed "memory"
    /// assert!(tokens.contains(&"leak".to_string()));
    /// assert!(tokens.len() >= 2); // At least "memori" and "leak"
    /// ```
    pub fn preprocess(&self, message: &str) -> Result<Vec<String>> {
        // Step 1: Tokenize
        let tokens = self
            .tokenizer
            .tokenize(message)
            .map_err(|e| anyhow!("Tokenization failed: {}", e))?;

        // Step 2: Lowercase
        let lowercase_tokens: Vec<String> = tokens.iter().map(|t| t.to_lowercase()).collect();

        // Step 3: Filter stop words
        let filtered_tokens = self
            .stop_words
            .filter(&lowercase_tokens)
            .map_err(|e| anyhow!("Stop words filtering failed: {}", e))?;

        // Step 4: Stem
        let stemmed_tokens = self
            .stemmer
            .stem_tokens(&filtered_tokens)
            .map_err(|e| anyhow!("Stemming failed: {}", e))?;

        Ok(stemmed_tokens)
    }

    /// Extract n-grams from a list of tokens.
    ///
    /// N-grams are contiguous sequences of n tokens.
    /// Useful for detecting multi-word patterns like "null pointer" or "race condition".
    ///
    /// # Arguments
    ///
    /// * `tokens` - Input tokens
    /// * `n` - Size of n-grams (1 = unigrams, 2 = bigrams, 3 = trigrams)
    ///
    /// # Returns
    ///
    /// * `Ok(Vec<String>)` - N-grams joined with underscores (empty if `tokens.len() < n`)
    /// * `Err` - If `n` is 0
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
    ///
    /// let processor = CommitMessageProcessor::new();
    /// let tokens: Vec<String> = vec![
    ///     "fix".to_string(),
    ///     "race".to_string(),
    ///     "condition".to_string(),
    ///     "mutex".to_string(),
    /// ];
    /// let bigrams = processor.extract_ngrams(&tokens, 2).unwrap();
    /// assert!(bigrams.contains(&"fix_race".to_string()));
    /// assert!(bigrams.contains(&"race_condition".to_string()));
    /// ```
    pub fn extract_ngrams(&self, tokens: &[String], n: usize) -> Result<Vec<String>> {
        if n == 0 {
            return Err(anyhow!("n must be greater than 0"));
        }

        if tokens.len() < n {
            return Ok(Vec::new());
        }

        let ngrams: Vec<String> = tokens.windows(n).map(|window| window.join("_")).collect();

        Ok(ngrams)
    }

    /// Preprocess and extract both unigrams and bigrams.
    ///
    /// Convenience method that combines preprocessing with n-gram extraction.
    /// Useful for feature extraction in ML models.
    ///
    /// # Arguments
    ///
    /// * `message` - Raw commit message
    ///
    /// # Returns
    ///
    /// * `Ok((Vec<String>, Vec<String>))` - (unigrams, bigrams)
    /// * `Err` - If preprocessing fails
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::CommitMessageProcessor;
    ///
    /// let processor = CommitMessageProcessor::new();
    /// let (unigrams, bigrams) = processor.preprocess_with_ngrams("fix: memory leak defect").unwrap();
    /// assert!(unigrams.contains(&"memori".to_string())); // Stemmed "memory"
    /// assert!(unigrams.contains(&"leak".to_string()));
    /// assert!(!bigrams.is_empty()); // Should have bigrams
    /// ```
    pub fn preprocess_with_ngrams(&self, message: &str) -> Result<(Vec<String>, Vec<String>)> {
        let tokens = self.preprocess(message)?;
        let bigrams = self.extract_ngrams(&tokens, 2)?;

        Ok((tokens, bigrams))
    }
}

impl Default for CommitMessageProcessor {
    fn default() -> Self {
        Self::new()
    }
}

/// TF-IDF feature extractor for commit messages.
///
/// This extractor converts commit messages into TF-IDF feature vectors for ML classification.
/// Implements Phase 2 of nlp-models-techniques-spec.md (Tier 2: TF-IDF + ML).
///
/// # Examples
///
/// ```rust
/// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
///
/// let messages: Vec<String> = vec![
///     "fix: null pointer dereference".to_string(),
///     "fix: race condition in mutex".to_string(),
///     "feat: add new feature".to_string(),
/// ];
///
/// let mut extractor = TfidfFeatureExtractor::new(1500);
/// let features = extractor.fit_transform(&messages).unwrap();
///
/// assert_eq!(features.n_rows(), 3); // 3 documents
/// ```
pub struct TfidfFeatureExtractor {
    vectorizer: TfidfVectorizer,
    max_features: usize,
}

impl TfidfFeatureExtractor {
    /// Create a new TF-IDF feature extractor.
    ///
    /// # Arguments
    ///
    /// * `max_features` - Maximum number of features (vocabulary size)
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
    ///
    /// let extractor = TfidfFeatureExtractor::new(1500);
    /// ```
    pub fn new(max_features: usize) -> Self {
        let vectorizer = TfidfVectorizer::new()
            .with_tokenizer(Box::new(WordTokenizer::new()))
            .with_lowercase(true)
            .with_max_features(max_features);

        Self {
            vectorizer,
            max_features,
        }
    }

    /// Fit the vectorizer on training messages and transform them to TF-IDF features.
    ///
    /// # Arguments
    ///
    /// * `messages` - Training commit messages
    ///
    /// # Returns
    ///
    /// * `Ok(Matrix<f64>)` - TF-IDF feature matrix (n_messages × vocabulary_size)
    /// * `Err` - If vectorization fails
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
    ///
    /// let messages: Vec<String> = vec![
    ///     "fix: memory leak".to_string(),
    ///     "fix: race condition".to_string(),
    /// ];
    ///
    /// let mut extractor = TfidfFeatureExtractor::new(1000);
    /// let features = extractor.fit_transform(&messages).unwrap();
    ///
    /// assert_eq!(features.n_rows(), 2);
    /// ```
    pub fn fit_transform(&mut self, messages: &[String]) -> Result<Matrix<f64>> {
        self.vectorizer
            .fit_transform(messages)
            .map_err(|e| anyhow!("TF-IDF fit_transform failed: {}", e))
    }

    /// Fit the vectorizer on training messages.
    ///
    /// # Arguments
    ///
    /// * `messages` - Training commit messages
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
    ///
    /// let messages = vec![
    ///     "fix: memory leak".to_string(),
    ///     "fix: race condition".to_string(),
    /// ];
    ///
    /// let mut extractor = TfidfFeatureExtractor::new(1000);
    /// extractor.fit(&messages).unwrap();
    /// ```
    pub fn fit(&mut self, messages: &[String]) -> Result<()> {
        self.vectorizer
            .fit(messages)
            .map_err(|e| anyhow!("TF-IDF fit failed: {}", e))
    }

    /// Transform messages to TF-IDF features using the fitted vocabulary.
    ///
    /// # Arguments
    ///
    /// * `messages` - Commit messages to transform
    ///
    /// # Returns
    ///
    /// * `Ok(Matrix<f64>)` - TF-IDF feature matrix
    /// * `Err` - If transformation fails
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
    ///
    /// let train_messages = vec![
    ///     "fix: memory leak".to_string(),
    ///     "fix: race condition".to_string(),
    /// ];
    ///
    /// let test_messages = vec!["fix: null pointer".to_string()];
    ///
    /// let mut extractor = TfidfFeatureExtractor::new(1000);
    /// extractor.fit(&train_messages).unwrap();
    ///
    /// let features = extractor.transform(&test_messages).unwrap();
    /// assert_eq!(features.n_rows(), 1);
    /// ```
    pub fn transform(&self, messages: &[String]) -> Result<Matrix<f64>> {
        self.vectorizer
            .transform(messages)
            .map_err(|e| anyhow!("TF-IDF transform failed: {}", e))
    }

    /// Get the vocabulary size (number of features).
    ///
    /// # Returns
    ///
    /// * `usize` - Number of features in vocabulary
    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
    ///
    /// let messages = vec![
    ///     "fix: bug".to_string(),
    ///     "feat: feature".to_string(),
    /// ];
    ///
    /// let mut extractor = TfidfFeatureExtractor::new(1000);
    /// extractor.fit(&messages).unwrap();
    ///
    /// assert!(extractor.vocabulary_size() > 0);
    /// assert!(extractor.vocabulary_size() <= 1000);
    /// ```
    pub fn vocabulary_size(&self) -> usize {
        self.vectorizer.vocabulary_size()
    }

    /// Get the maximum features configuration.
    ///
    /// # Returns
    ///
    /// * `usize` - Maximum number of features
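    ///
    /// # Examples
    ///
    /// ```rust
    /// use organizational_intelligence_plugin::nlp::TfidfFeatureExtractor;
    ///
    /// // The getter simply echoes the limit passed to `new`.
    /// let extractor = TfidfFeatureExtractor::new(1500);
    /// assert_eq!(extractor.max_features(), 1500);
    /// ```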
    pub fn max_features(&self) -> usize {
        self.max_features
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_processor_creation() {
        let _processor = CommitMessageProcessor::new();
        let _processor2 = CommitMessageProcessor::default();
    }

    #[test]
    fn test_basic_preprocessing() {
        let processor = CommitMessageProcessor::new();
        let message = "fix: memory leak detected";
        let tokens = processor.preprocess(message).unwrap();

        // Should contain key technical terms (stemmed)
        // "memory" -> "memori", "leak" -> "leak", "detected" -> "detect"
        assert!(tokens
            .iter()
            .any(|t| t.starts_with("memori") || t.starts_with("memory")));
        assert!(tokens.iter().any(|t| t.starts_with("leak")));
        assert!(tokens.iter().any(|t| t.starts_with("detect")));
    }

    #[test]
    fn test_preprocessing_handles_punctuation() {
        let processor = CommitMessageProcessor::new();
        let message = "fix: race condition in mutex lock()";
        let tokens = processor.preprocess(message).unwrap();

        // Should contain technical terms without punctuation complications
        assert!(tokens
            .iter()
            .any(|t| t.starts_with("race") || t.starts_with("rac")));
        assert!(tokens
            .iter()
            .any(|t| t.starts_with("condit") || t.starts_with("condition")));
        assert!(tokens.iter().any(|t| t.starts_with("mutex")));
        assert!(tokens.iter().any(|t| t.starts_with("lock")));
    }

    #[test]
    fn test_ngram_extraction() {
        let processor = CommitMessageProcessor::new();
        let tokens = vec![
            "fix".to_string(),
            "race".to_string(),
            "condition".to_string(),
        ];

        let bigrams = processor.extract_ngrams(&tokens, 2).unwrap();
        assert_eq!(bigrams.len(), 2);
        assert!(bigrams.contains(&"fix_race".to_string()));
        assert!(bigrams.contains(&"race_condition".to_string()));
    }

    #[test]
    fn test_ngram_extraction_trigrams() {
        let processor = CommitMessageProcessor::new();
        let tokens = vec![
            "fix".to_string(),
            "null".to_string(),
            "pointer".to_string(),
            "dereference".to_string(),
        ];

        let trigrams = processor.extract_ngrams(&tokens, 3).unwrap();
        assert_eq!(trigrams.len(), 2);
        assert!(trigrams.contains(&"fix_null_pointer".to_string()));
        assert!(trigrams.contains(&"null_pointer_dereference".to_string()));
    }

    #[test]
    fn test_ngram_empty_tokens() {
        let processor = CommitMessageProcessor::new();
        let tokens: Vec<String> = vec![];

        let bigrams = processor.extract_ngrams(&tokens, 2).unwrap();
        assert!(bigrams.is_empty());
    }

    #[test]
    fn test_ngram_insufficient_tokens() {
        let processor = CommitMessageProcessor::new();
        let tokens = vec!["single".to_string()];

        let bigrams = processor.extract_ngrams(&tokens, 2).unwrap();
        assert!(bigrams.is_empty());
    }

    #[test]
    fn test_ngram_zero_n_error() {
        let processor = CommitMessageProcessor::new();
        let tokens = vec!["test".to_string()];

        let result = processor.extract_ngrams(&tokens, 0);
        assert!(result.is_err());
    }

    #[test]
    fn test_preprocess_with_ngrams() {
        let processor = CommitMessageProcessor::new();
        let message = "fix memory leak in parser";

        let (unigrams, bigrams) = processor.preprocess_with_ngrams(message).unwrap();

        assert!(!unigrams.is_empty());
        assert!(!bigrams.is_empty());
    }

    #[test]
    fn test_custom_stop_words() {
        let processor = CommitMessageProcessor::with_custom_stop_words(vec!["custom", "stop"]);
        let message = "custom test stop words";

        let tokens = processor.preprocess(message).unwrap();

        // "custom" and "stop" should be filtered
        assert!(!tokens.contains(&"custom".to_string()));
        assert!(!tokens.contains(&"stop".to_string()));
        // "test" and "words" should remain
        assert!(tokens.iter().any(|t| t.starts_with("test")));
    }

    #[test]
    fn test_preprocessing_with_code_tokens() {
        let processor = CommitMessageProcessor::new();
        let message = "fix: parse_expr() null check in into_iter()";

        let tokens = processor.preprocess(message).unwrap();

        // Code identifiers should be tokenized
        assert!(tokens
            .iter()
            .any(|t| t.contains("pars") || t.contains("expr")));
        assert!(tokens.iter().any(|t| t.contains("null")));
    }

    #[test]
    fn test_stemming_normalization() {
        let processor = CommitMessageProcessor::new();
        let message1 = "fixing bugs";
        let message2 = "fixed bug";

        let tokens1 = processor.preprocess(message1).unwrap();
        let tokens2 = processor.preprocess(message2).unwrap();

        // Both messages should stem to the same "fix" and "bug" roots
        let has_fix_stem1 = tokens1.iter().any(|t| t.starts_with("fix"));
        let has_fix_stem2 = tokens2.iter().any(|t| t.starts_with("fix"));
        let has_bug_stem1 = tokens1.iter().any(|t| t.starts_with("bug"));
        let has_bug_stem2 = tokens2.iter().any(|t| t.starts_with("bug"));

        assert!(has_fix_stem1 && has_fix_stem2);
        assert!(has_bug_stem1 && has_bug_stem2);
    }

    #[test]
    fn test_empty_message() {
        let processor = CommitMessageProcessor::new();
        let tokens = processor.preprocess("").unwrap();
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_whitespace_only_message() {
        let processor = CommitMessageProcessor::new();
        let tokens = processor.preprocess("   \t\n   ").unwrap();
        assert!(tokens.is_empty());
    }

    // TF-IDF feature extraction tests

    #[test]
    fn test_tfidf_extractor_creation() {
        let extractor = TfidfFeatureExtractor::new(1000);
        assert_eq!(extractor.max_features(), 1000);
    }

    #[test]
    fn test_tfidf_fit_transform_basic() {
        let messages = vec![
            "fix: memory leak".to_string(),
            "fix: race condition".to_string(),
            "feat: add new feature".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        let features = extractor.fit_transform(&messages).unwrap();

        // Should produce matrix with correct dimensions
        assert_eq!(features.n_rows(), 3); // 3 documents
        assert!(features.n_cols() > 0); // At least some features
        assert!(features.n_cols() <= 1000); // Respects max_features
    }

    #[test]
    fn test_tfidf_fit_and_transform_separate() {
        let train_messages = vec![
            "fix: memory leak".to_string(),
            "fix: race condition".to_string(),
        ];

        let test_messages = vec!["fix: null pointer".to_string()];

        let mut extractor = TfidfFeatureExtractor::new(1000);

        // Fit on training data
        extractor.fit(&train_messages).unwrap();

        // Transform test data
        let features = extractor.transform(&test_messages).unwrap();

        assert_eq!(features.n_rows(), 1);
        assert_eq!(features.n_cols(), extractor.vocabulary_size());
    }

    #[test]
    fn test_tfidf_vocabulary_size() {
        let messages = vec![
            "fix bug".to_string(),
            "feat feature".to_string(),
            "test code".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        extractor.fit(&messages).unwrap();

        let vocab_size = extractor.vocabulary_size();
        assert!(vocab_size > 0);
        assert!(vocab_size <= 1000); // Respects max_features
    }

    #[test]
    fn test_tfidf_max_features_limit() {
        let messages = vec![
            "word1 word2 word3 word4 word5".to_string(),
            "word6 word7 word8 word9 word10".to_string(),
            "word11 word12 word13 word14 word15".to_string(),
        ];

        // Limit to 5 features
        let mut extractor = TfidfFeatureExtractor::new(5);
        extractor.fit(&messages).unwrap();

        assert!(extractor.vocabulary_size() <= 5);
    }

    #[test]
    fn test_tfidf_with_real_commit_messages() {
        let messages = vec![
            "fix: null pointer dereference in parser".to_string(),
            "fix: race condition in mutex lock".to_string(),
            "feat: add TF-IDF feature extraction".to_string(),
            "docs: update README with examples".to_string(),
            "test: add unit tests for classifier".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1500);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 5);
        assert!(features.n_cols() > 0);

        // Check that feature values are reasonable (non-negative)
        for row in 0..features.n_rows() {
            for col in 0..features.n_cols() {
                assert!(features.get(row, col) >= 0.0);
            }
        }
    }

    #[test]
    fn test_tfidf_empty_messages() {
        let messages: Vec<String> = vec![];

        let mut extractor = TfidfFeatureExtractor::new(1000);

        // Behavior on an empty corpus is backend-defined; the call just
        // needs to return (Ok or Err) without panicking.
        let _ = extractor.fit_transform(&messages);
    }

    #[test]
    fn test_tfidf_single_message() {
        let messages = vec!["fix: single message".to_string()];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 1);
        assert!(features.n_cols() > 0);
    }

    #[test]
    fn test_tfidf_duplicate_messages() {
        let messages = vec![
            "fix: memory leak".to_string(),
            "fix: memory leak".to_string(),
            "fix: memory leak".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 3);

        // Identical messages share the same term frequencies and IDF weights,
        // so their feature vectors are identical; here we just verify that
        // they transform successfully.
        assert!(features.n_cols() > 0);
    }

    #[test]
    fn test_tfidf_transform_new_data() {
        let train_messages = vec![
            "fix: memory leak".to_string(),
            "fix: race condition".to_string(),
            "feat: new feature".to_string(),
        ];

        let test_messages = vec![
            "fix: another memory issue".to_string(),
            "feat: different feature".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        extractor.fit(&train_messages).unwrap();

        let test_features = extractor.transform(&test_messages).unwrap();

        assert_eq!(test_features.n_rows(), 2);
        assert_eq!(test_features.n_cols(), extractor.vocabulary_size());
    }

    #[test]
    fn test_tfidf_configuration() {
        let extractor = TfidfFeatureExtractor::new(1500);

        assert_eq!(extractor.max_features(), 1500);
    }

    #[test]
    fn test_tfidf_with_software_terms() {
        let messages = vec![
            "fix: null pointer dereference".to_string(),
            "fix: buffer overflow in parse_expr".to_string(),
            "fix: race condition deadlock".to_string(),
            "fix: memory leak in allocator".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1000);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 4);

        // Verify that technical terms are captured
        // (vocabulary building works correctly)
        assert!(extractor.vocabulary_size() > 0);
    }

    #[test]
    fn test_tfidf_transpiler_specific_terms() {
        let messages = vec![
            "fix: operator precedence bug".to_string(),
            "fix: AST transform error".to_string(),
            "fix: lifetime parameter issue".to_string(),
            "fix: trait bound constraint".to_string(),
        ];

        let mut extractor = TfidfFeatureExtractor::new(1500);
        let features = extractor.fit_transform(&messages).unwrap();

        assert_eq!(features.n_rows(), 4);
        assert!(extractor.vocabulary_size() > 0);
    }
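
    #[test]
    fn test_preprocess_then_tfidf_pipeline() {
        // A minimal end-to-end sketch: normalize messages with
        // CommitMessageProcessor, then vectorize the rejoined tokens.
        // Assumption: joining stemmed tokens with spaces is an acceptable
        // input representation for TfidfFeatureExtractor.
        let processor = CommitMessageProcessor::new();
        let raw = vec![
            "fix: memory leak in parser".to_string(),
            "fix: race condition in mutex lock".to_string(),
        ];

        let normalized: Vec<String> = raw
            .iter()
            .map(|m| processor.preprocess(m).map(|tokens| tokens.join(" ")))
            .collect::<Result<_>>()
            .unwrap();

        let mut extractor = TfidfFeatureExtractor::new(1000);
        let features = extractor.fit_transform(&normalized).unwrap();

        assert_eq!(features.n_rows(), 2); // one row per message
        assert!(features.n_cols() > 0);
    }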
}