organizational_intelligence_plugin/
features.rs

//! Feature Extraction for GPU Processing
//!
//! Implements Section 4.3: Feature Extraction
//! Converts OIP defect classifications into GPU-friendly numerical features
//!
//! OPT-001: Integrated BatchProcessor for efficient bulk extraction
7
8use crate::citl::{ErrorCodeClass, SuggestionApplicability};
9use crate::perf::{BatchProcessor, PerfStats};
10use anyhow::Result;
11use chrono::{Datelike, Timelike};
12use serde::{Deserialize, Serialize};
13use std::time::Instant;
14
15/// Commit features optimized for GPU processing
16///
17/// NLP-014: Extended from 8 to 14 dimensions for CITL integration
18#[derive(Debug, Clone, Serialize, Deserialize)]
19pub struct CommitFeatures {
20    // Categorical (one-hot encoded for GPU)
21    pub defect_category: u8, // 0-17 (18 categories from OIP)
22
23    // Numerical (GPU-native f32)
24    pub files_changed: f32,
25    pub lines_added: f32,
26    pub lines_deleted: f32,
27    pub complexity_delta: f32, // Cyclomatic complexity change
28
29    // Temporal
30    pub timestamp: f64,  // Unix epoch
31    pub hour_of_day: u8, // 0-23 (circadian patterns)
32    pub day_of_week: u8, // 0-6
33
34    // NLP-014: CITL features (6 new dims)
35    /// Error code class: 0=type, 1=borrow, 2=name, 3=trait, 4=other
36    #[serde(default)]
37    pub error_code_class: u8,
38    /// Whether a suggestion was provided: 0 or 1
39    #[serde(default)]
40    pub has_suggestion: u8,
41    /// Suggestion applicability: 0=none, 1=machine, 2=maybe, 3=placeholder
42    #[serde(default)]
43    pub suggestion_applicability: u8,
44    /// Count of clippy lints (0-255)
45    #[serde(default)]
46    pub clippy_lint_count: u8,
47    /// Distance from function start (normalized span line delta)
48    #[serde(default)]
49    pub span_line_delta: f32,
50    /// Diagnostic confidence from taxonomy mapping
51    #[serde(default)]
52    pub diagnostic_confidence: f32,
53}
54
55impl CommitFeatures {
56    /// Convert to flat vector for GPU processing
57    ///
58    /// Fixed-size vector enables efficient GPU batching
59    /// NLP-014: Extended to 14 dimensions
60    pub fn to_vector(&self) -> Vec<f32> {
61        vec![
62            self.defect_category as f32,
63            self.files_changed,
64            self.lines_added,
65            self.lines_deleted,
66            self.complexity_delta,
67            self.timestamp as f32,
68            self.hour_of_day as f32,
69            self.day_of_week as f32,
70            // NLP-014: CITL features
71            self.error_code_class as f32,
72            self.has_suggestion as f32,
73            self.suggestion_applicability as f32,
74            self.clippy_lint_count as f32,
75            self.span_line_delta,
76            self.diagnostic_confidence,
77        ]
78    }
79
80    /// Vector dimension count (for GPU buffer allocation)
81    /// NLP-014: Extended from 8 to 14 dimensions
82    pub const DIMENSION: usize = 14;
83}
84
85impl Default for CommitFeatures {
86    fn default() -> Self {
87        Self {
88            defect_category: 0,
89            files_changed: 0.0,
90            lines_added: 0.0,
91            lines_deleted: 0.0,
92            complexity_delta: 0.0,
93            timestamp: 0.0,
94            hour_of_day: 0,
95            day_of_week: 0,
96            error_code_class: ErrorCodeClass::Other.as_u8(),
97            has_suggestion: 0,
98            suggestion_applicability: SuggestionApplicability::None.as_u8(),
99            clippy_lint_count: 0,
100            span_line_delta: 0.0,
101            diagnostic_confidence: 0.0,
102        }
103    }
104}
105
/// Extract features from OIP defect record
///
/// Stateless unit type: it holds no configuration, so copies are free.
#[derive(Debug, Clone, Copy)]
pub struct FeatureExtractor;
108
109impl FeatureExtractor {
110    pub fn new() -> Self {
111        Self
112    }
113
114    /// Extract features from defect category and metadata
115    ///
116    /// Uses default values for CITL fields (backwards compatible)
117    pub fn extract(
118        &self,
119        category: u8,
120        files_changed: usize,
121        lines_added: usize,
122        lines_deleted: usize,
123        timestamp: i64,
124    ) -> Result<CommitFeatures> {
125        self.extract_with_citl(
126            category,
127            files_changed,
128            lines_added,
129            lines_deleted,
130            timestamp,
131            ErrorCodeClass::Other,
132            false,
133            SuggestionApplicability::None,
134            0,
135            0.0,
136            0.0,
137        )
138    }
139
140    /// Extract features with CITL diagnostic information (NLP-014)
141    #[allow(clippy::too_many_arguments)]
142    pub fn extract_with_citl(
143        &self,
144        category: u8,
145        files_changed: usize,
146        lines_added: usize,
147        lines_deleted: usize,
148        timestamp: i64,
149        error_code_class: ErrorCodeClass,
150        has_suggestion: bool,
151        suggestion_applicability: SuggestionApplicability,
152        clippy_lint_count: u8,
153        span_line_delta: f32,
154        diagnostic_confidence: f32,
155    ) -> Result<CommitFeatures> {
156        // Convert timestamp to hour/day
157        let datetime = chrono::DateTime::from_timestamp(timestamp, 0)
158            .ok_or_else(|| anyhow::anyhow!("Invalid timestamp"))?;
159
160        let hour_of_day = datetime.hour() as u8;
161        let day_of_week = datetime.weekday().num_days_from_monday() as u8;
162
163        Ok(CommitFeatures {
164            defect_category: category,
165            files_changed: files_changed as f32,
166            lines_added: lines_added as f32,
167            lines_deleted: lines_deleted as f32,
168            complexity_delta: 0.0, // Will compute in future iteration
169            timestamp: timestamp as f64,
170            hour_of_day,
171            day_of_week,
172            // NLP-014: CITL features
173            error_code_class: error_code_class.as_u8(),
174            has_suggestion: u8::from(has_suggestion),
175            suggestion_applicability: suggestion_applicability.as_u8(),
176            clippy_lint_count,
177            span_line_delta,
178            diagnostic_confidence,
179        })
180    }
181}
182
183impl Default for FeatureExtractor {
184    fn default() -> Self {
185        Self::new()
186    }
187}
188
/// Input data for batch feature extraction
///
/// Carries exactly the arguments of `FeatureExtractor::extract`, so a batch
/// can be queued now and converted later.
#[derive(Debug, Clone)]
pub struct FeatureInput {
    /// Defect category index
    pub category: u8,
    /// Number of files touched by the commit
    pub files_changed: usize,
    /// Number of lines added
    pub lines_added: usize,
    /// Number of lines deleted
    pub lines_deleted: usize,
    /// Commit time as seconds since the Unix epoch
    pub timestamp: i64,
}
198
199/// Batch feature extractor with performance tracking
200///
201/// OPT-001: Uses BatchProcessor for efficient bulk extraction
202pub struct BatchFeatureExtractor {
203    extractor: FeatureExtractor,
204    batch_processor: BatchProcessor<FeatureInput>,
205    stats: PerfStats,
206}
207
208impl BatchFeatureExtractor {
209    /// Create batch extractor with default batch size (1000)
210    pub fn new() -> Self {
211        Self::with_batch_size(1000)
212    }
213
214    /// Create batch extractor with custom batch size
215    pub fn with_batch_size(batch_size: usize) -> Self {
216        Self {
217            extractor: FeatureExtractor::new(),
218            batch_processor: BatchProcessor::new(batch_size),
219            stats: PerfStats::new(),
220        }
221    }
222
223    /// Add input to batch, returns extracted features if batch is full
224    pub fn add(&mut self, input: FeatureInput) -> Option<Vec<CommitFeatures>> {
225        self.batch_processor
226            .add(input)
227            .map(|batch| self.extract_batch(batch))
228    }
229
230    /// Flush remaining inputs and extract features
231    pub fn flush(&mut self) -> Vec<CommitFeatures> {
232        let batch = self.batch_processor.flush();
233        if batch.is_empty() {
234            Vec::new()
235        } else {
236            self.extract_batch(batch)
237        }
238    }
239
240    /// Extract features from batch with performance tracking
241    fn extract_batch(&mut self, inputs: Vec<FeatureInput>) -> Vec<CommitFeatures> {
242        let start = Instant::now();
243
244        let features: Vec<CommitFeatures> = inputs
245            .into_iter()
246            .filter_map(|input| {
247                self.extractor
248                    .extract(
249                        input.category,
250                        input.files_changed,
251                        input.lines_added,
252                        input.lines_deleted,
253                        input.timestamp,
254                    )
255                    .ok()
256            })
257            .collect();
258
259        let duration_ns = start.elapsed().as_nanos() as u64;
260        self.stats.record(duration_ns);
261
262        features
263    }
264
265    /// Extract all features at once (convenience method)
266    pub fn extract_all(&mut self, inputs: Vec<FeatureInput>) -> Vec<CommitFeatures> {
267        let start = Instant::now();
268
269        let features: Vec<CommitFeatures> = inputs
270            .into_iter()
271            .filter_map(|input| {
272                self.extractor
273                    .extract(
274                        input.category,
275                        input.files_changed,
276                        input.lines_added,
277                        input.lines_deleted,
278                        input.timestamp,
279                    )
280                    .ok()
281            })
282            .collect();
283
284        let duration_ns = start.elapsed().as_nanos() as u64;
285        self.stats.record(duration_ns);
286
287        features
288    }
289
290    /// Get performance statistics
291    pub fn stats(&self) -> &PerfStats {
292        &self.stats
293    }
294
295    /// Get pending item count
296    pub fn pending(&self) -> usize {
297        self.batch_processor.len()
298    }
299}
300
301impl Default for BatchFeatureExtractor {
302    fn default() -> Self {
303        Self::new()
304    }
305}
306
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_feature_extractor_creation() {
        let _extractor = FeatureExtractor::new();
        // Extractor is zero-sized type, just verify it compiles
    }

    #[test]
    fn test_extract_basic_features() {
        let extractor = FeatureExtractor::new();

        // Category 2, 3 files, 100 lines added, 50 deleted
        let features = extractor
            .extract(
                2, 3, 100, 50, 1700000000, // 2023-11-14
            )
            .unwrap();

        assert_eq!(features.defect_category, 2);
        assert_eq!(features.files_changed, 3.0);
        assert_eq!(features.lines_added, 100.0);
        assert_eq!(features.lines_deleted, 50.0);
    }

    #[test]
    fn test_to_vector_dimension() {
        let features = CommitFeatures {
            defect_category: 1,
            files_changed: 2.0,
            lines_added: 10.0,
            lines_deleted: 5.0,
            complexity_delta: 0.0,
            timestamp: 1700000000.0,
            hour_of_day: 14,
            day_of_week: 2,
            // NLP-014: CITL fields
            error_code_class: 0,
            has_suggestion: 1,
            suggestion_applicability: 2,
            clippy_lint_count: 3,
            span_line_delta: 4.5,
            diagnostic_confidence: 0.95,
        };

        let vec = features.to_vector();
        assert_eq!(vec.len(), CommitFeatures::DIMENSION);
        assert_eq!(vec.len(), 14); // NLP-014: 14 dimensions
        assert_eq!(vec[0], 1.0); // category
        assert_eq!(vec[1], 2.0); // files
        assert_eq!(vec[2], 10.0); // lines added

        // NLP-014: CITL features
        assert_eq!(vec[8], 0.0); // error_code_class
        assert_eq!(vec[9], 1.0); // has_suggestion
        assert_eq!(vec[10], 2.0); // suggestion_applicability
        assert_eq!(vec[11], 3.0); // clippy_lint_count
        assert!((vec[12] - 4.5).abs() < 0.001); // span_line_delta
        assert!((vec[13] - 0.95).abs() < 0.001); // diagnostic_confidence
    }

    #[test]
    fn test_default_features_vector_dimension() {
        // The default record must still produce a full-width vector.
        let vec = CommitFeatures::default().to_vector();
        assert_eq!(vec.len(), CommitFeatures::DIMENSION);
    }

    #[test]
    fn test_temporal_features() {
        let extractor = FeatureExtractor::new();

        // Known timestamp: 2023-11-14 14:10:00 UTC (Tuesday)
        let features = extractor.extract(0, 1, 1, 1, 1699971000).unwrap();

        assert_eq!(features.hour_of_day, 14);
        assert_eq!(features.day_of_week, 1); // Tuesday (0=Mon, 1=Tue)
    }

    #[test]
    fn test_invalid_timestamp() {
        let extractor = FeatureExtractor::new();

        // Out of range timestamp should error
        let result = extractor.extract(0, 1, 1, 1, i64::MAX);
        assert!(result.is_err());
    }

    #[test]
    fn test_batch_extractor_creation() {
        let extractor = BatchFeatureExtractor::new();
        assert_eq!(extractor.pending(), 0);
    }

    #[test]
    fn test_batch_extractor_add() {
        let mut extractor = BatchFeatureExtractor::with_batch_size(3);

        let input1 = FeatureInput {
            category: 0,
            files_changed: 1,
            lines_added: 10,
            lines_deleted: 5,
            timestamp: 1700000000,
        };

        // First two shouldn't trigger extraction
        assert!(extractor.add(input1.clone()).is_none());
        assert!(extractor.add(input1.clone()).is_none());
        assert_eq!(extractor.pending(), 2);

        // Third should trigger
        let batch = extractor.add(input1);
        assert!(batch.is_some());
        assert_eq!(batch.unwrap().len(), 3);
        assert_eq!(extractor.pending(), 0);
    }

    #[test]
    fn test_batch_extractor_flush() {
        let mut extractor = BatchFeatureExtractor::with_batch_size(10);

        let input = FeatureInput {
            category: 1,
            files_changed: 2,
            lines_added: 20,
            lines_deleted: 10,
            timestamp: 1700000000,
        };

        extractor.add(input.clone());
        extractor.add(input.clone());
        extractor.add(input);

        let remaining = extractor.flush();
        assert_eq!(remaining.len(), 3);
        assert_eq!(extractor.pending(), 0);
    }

    #[test]
    fn test_batch_extractor_extract_all() {
        let mut extractor = BatchFeatureExtractor::new();

        let inputs: Vec<FeatureInput> = (0..5)
            .map(|i| FeatureInput {
                category: i as u8,
                files_changed: i + 1,
                lines_added: (i + 1) * 10,
                lines_deleted: (i + 1) * 5,
                timestamp: 1700000000 + i as i64,
            })
            .collect();

        let features = extractor.extract_all(inputs);
        assert_eq!(features.len(), 5);
        assert_eq!(features[0].defect_category, 0);
        assert_eq!(features[4].defect_category, 4);
    }

    #[test]
    fn test_batch_extractor_skips_invalid_timestamps() {
        let mut extractor = BatchFeatureExtractor::new();

        let valid = FeatureInput {
            category: 7,
            files_changed: 1,
            lines_added: 1,
            lines_deleted: 1,
            timestamp: 1700000000,
        };
        let invalid = FeatureInput {
            timestamp: i64::MAX,
            ..valid.clone()
        };

        // Inputs that fail extraction are dropped rather than reported.
        let features = extractor.extract_all(vec![invalid, valid]);
        assert_eq!(features.len(), 1);
        assert_eq!(features[0].defect_category, 7);
    }

    #[test]
    fn test_batch_extractor_stats() {
        let mut extractor = BatchFeatureExtractor::new();

        let inputs: Vec<FeatureInput> = (0..100)
            .map(|i| FeatureInput {
                category: (i % 10) as u8,
                files_changed: i + 1,
                lines_added: (i + 1) * 10,
                lines_deleted: (i + 1) * 5,
                timestamp: 1700000000 + i as i64,
            })
            .collect();

        extractor.extract_all(inputs);

        let stats = extractor.stats();
        assert_eq!(stats.operation_count, 1);
        assert!(stats.avg_ns() > 0);
    }
}