Skip to main content

valknut_rs/core/
featureset.rs

//! Feature extraction framework and data structures.
//!
//! This module provides the core abstractions for feature extraction in valknut-rs,
//! including feature definitions, extractors, and feature vectors. The design emphasizes
//! performance and type safety while maintaining compatibility with the Python implementation.
6
7use std::collections::HashMap;
8use std::sync::Arc;
9
10use async_trait::async_trait;
11use serde::{Deserialize, Serialize};
12
13use crate::core::errors::{Result, ValknutError};
14
15#[cfg(test)]
16#[path = "featureset_tests.rs"]
17mod tests;
18
/// Unique identifier for entities in the system.
///
/// This is a plain alias for [`String`]; it is used as the key of
/// [`ExtractionContext::entity_index`] and as the `id` of a [`CodeEntity`].
pub type EntityId = String;
21
22/// Definition of a feature that can be extracted from code entities.
23#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
24pub struct FeatureDefinition {
25    /// Unique name of the feature
26    pub name: String,
27
28    /// Human-readable description of what this feature measures
29    pub description: String,
30
31    /// Data type of the feature value (for serialization metadata)
32    pub data_type: String,
33
34    /// Minimum expected value (for normalization)
35    pub min_value: Option<f64>,
36
37    /// Maximum expected value (for normalization)
38    pub max_value: Option<f64>,
39
40    /// Default value when feature cannot be computed
41    pub default_value: f64,
42
43    /// True if higher values indicate more refactoring need
44    pub higher_is_worse: bool,
45}
46
47/// Factory and configuration methods for [`FeatureDefinition`].
48impl FeatureDefinition {
49    /// Create a new feature definition
50    pub fn new(name: impl Into<String>, description: impl Into<String>) -> Self {
51        Self {
52            name: name.into(),
53            description: description.into(),
54            data_type: "f64".to_string(),
55            min_value: None,
56            max_value: None,
57            default_value: 0.0,
58            higher_is_worse: true,
59        }
60    }
61
62    /// Set the value range for this feature
63    pub fn with_range(mut self, min_value: f64, max_value: f64) -> Self {
64        self.min_value = Some(min_value);
65        self.max_value = Some(max_value);
66        self
67    }
68
69    /// Set the default value for this feature
70    pub fn with_default(mut self, default_value: f64) -> Self {
71        self.default_value = default_value;
72        self
73    }
74
75    /// Set whether higher values are worse (default: true)
76    pub fn with_polarity(mut self, higher_is_worse: bool) -> Self {
77        self.higher_is_worse = higher_is_worse;
78        self
79    }
80
81    /// Check if a value is within the expected range
82    pub fn is_valid_value(&self, value: f64) -> bool {
83        if value.is_nan() || value.is_infinite() {
84            return false;
85        }
86
87        if let Some(min) = self.min_value {
88            if value < min {
89                return false;
90            }
91        }
92
93        if let Some(max) = self.max_value {
94            if value > max {
95                return false;
96            }
97        }
98
99        true
100    }
101
102    /// Clamp a value to the valid range
103    pub fn clamp_value(&self, value: f64) -> f64 {
104        if value.is_nan() || value.is_infinite() {
105            return self.default_value;
106        }
107
108        let mut clamped = value;
109
110        if let Some(min) = self.min_value {
111            if clamped < min {
112                clamped = min;
113            }
114        }
115
116        if let Some(max) = self.max_value {
117            if clamped > max {
118                clamped = max;
119            }
120        }
121
122        clamped
123    }
124}
125
126/// Container for an entity's computed feature vector.
127#[derive(Debug, Clone, Serialize, Deserialize)]
128pub struct FeatureVector {
129    /// Unique identifier for the entity
130    pub entity_id: EntityId,
131
132    /// Raw feature values as computed by extractors
133    pub features: HashMap<String, f64>,
134
135    /// Normalized feature values (after scoring pipeline)
136    pub normalized_features: HashMap<String, f64>,
137
138    /// Additional metadata about the entity or extraction process
139    pub metadata: HashMap<String, serde_json::Value>,
140
141    /// Refactoring suggestions generated during analysis
142    pub refactoring_suggestions: Vec<RefactoringSuggestion>,
143}
144
145/// Factory, mutation, and analysis methods for [`FeatureVector`].
146impl FeatureVector {
147    /// Create a new empty feature vector for an entity
148    pub fn new(entity_id: impl Into<EntityId>) -> Self {
149        Self {
150            entity_id: entity_id.into(),
151            features: HashMap::new(),
152            normalized_features: HashMap::new(),
153            metadata: HashMap::new(),
154            refactoring_suggestions: Vec::new(),
155        }
156    }
157
158    /// Add a feature value to the vector
159    pub fn add_feature(&mut self, name: impl Into<String>, value: f64) -> &mut Self {
160        self.features.insert(name.into(), value);
161        self
162    }
163
164    /// Get a feature value by name
165    pub fn get_feature(&self, name: &str) -> Option<f64> {
166        self.features.get(name).copied()
167    }
168
169    /// Get a normalized feature value by name
170    pub fn get_normalized_feature(&self, name: &str) -> Option<f64> {
171        self.normalized_features.get(name).copied()
172    }
173
174    /// Add metadata for the entity
175    pub fn add_metadata(&mut self, key: impl Into<String>, value: serde_json::Value) -> &mut Self {
176        self.metadata.insert(key.into(), value);
177        self
178    }
179
180    /// Add a refactoring suggestion
181    pub fn add_suggestion(&mut self, suggestion: RefactoringSuggestion) -> &mut Self {
182        self.refactoring_suggestions.push(suggestion);
183        self
184    }
185
186    /// Get the number of features in this vector
187    pub fn feature_count(&self) -> usize {
188        self.features.len()
189    }
190
191    /// Check if the vector contains a specific feature
192    pub fn has_feature(&self, name: &str) -> bool {
193        self.features.contains_key(name)
194    }
195
196    /// Get all feature names
197    pub fn feature_names(&self) -> impl Iterator<Item = &String> {
198        self.features.keys()
199    }
200
201    /// Compute the L2 norm of the feature vector
202    pub fn l2_norm(&self) -> f64 {
203        self.features.values().map(|v| v * v).sum::<f64>().sqrt()
204    }
205
206    /// Compute cosine similarity with another feature vector
207    pub fn cosine_similarity(&self, other: &Self) -> f64 {
208        let mut dot_product = 0.0;
209        let mut norm_self_squared = 0.0;
210        let mut norm_other_squared = 0.0;
211
212        // Compute dot product and norms over shared features
213        for (name, &value_a) in &self.features {
214            norm_self_squared += value_a * value_a;
215
216            if let Some(&value_b) = other.features.get(name) {
217                dot_product += value_a * value_b;
218            }
219        }
220
221        for &value_b in other.features.values() {
222            norm_other_squared += value_b * value_b;
223        }
224
225        let denominator = (norm_self_squared * norm_other_squared).sqrt();
226        if denominator == 0.0 {
227            0.0
228        } else {
229            dot_product / denominator
230        }
231    }
232}
233
234/// Refactoring suggestion with priority and description
235#[derive(Debug, Clone, Serialize, Deserialize)]
236pub struct RefactoringSuggestion {
237    /// Type of refactoring suggested
238    pub refactoring_type: String,
239
240    /// Human-readable description of the suggestion
241    pub description: String,
242
243    /// Priority level (0.0 = low, 1.0 = critical)
244    pub priority: f64,
245
246    /// Confidence in the suggestion (0.0 = uncertain, 1.0 = high confidence)
247    pub confidence: f64,
248
249    /// Location information (file path, line numbers, etc.)
250    pub location: Option<serde_json::Value>,
251
252    /// Additional context or reasoning
253    pub context: Option<String>,
254}
255
256/// Factory and builder methods for [`RefactoringSuggestion`].
257impl RefactoringSuggestion {
258    /// Create a new refactoring suggestion
259    pub fn new(
260        refactoring_type: impl Into<String>,
261        description: impl Into<String>,
262        priority: f64,
263        confidence: f64,
264    ) -> Self {
265        Self {
266            refactoring_type: refactoring_type.into(),
267            description: description.into(),
268            priority: priority.clamp(0.0, 1.0),
269            confidence: confidence.clamp(0.0, 1.0),
270            location: None,
271            context: None,
272        }
273    }
274
275    /// Add location information to the suggestion
276    pub fn with_location(mut self, location: serde_json::Value) -> Self {
277        self.location = Some(location);
278        self
279    }
280
281    /// Add context to the suggestion
282    pub fn with_context(mut self, context: impl Into<String>) -> Self {
283        self.context = Some(context.into());
284        self
285    }
286
287    /// Check if this suggestion is high priority
288    pub fn is_high_priority(&self) -> bool {
289        self.priority >= 0.7
290    }
291
292    /// Check if this suggestion is high confidence
293    pub fn is_high_confidence(&self) -> bool {
294        self.confidence >= 0.8
295    }
296}
297
298/// Trait for extracting features from code entities.
299///
300/// This trait defines the interface for all feature extractors in the system.
301/// Extractors are responsible for computing specific features from parsed code entities.
302#[async_trait]
303pub trait FeatureExtractor: Send + Sync {
304    /// Get the name of this extractor
305    fn name(&self) -> &str;
306
307    /// Get the list of features this extractor provides
308    fn features(&self) -> &[FeatureDefinition];
309
310    /// Extract features from an entity
311    async fn extract(
312        &self,
313        entity: &CodeEntity,
314        context: &ExtractionContext,
315    ) -> Result<HashMap<String, f64>>;
316
317    /// Check if this extractor supports the given entity type
318    fn supports_entity(&self, entity: &CodeEntity) -> bool {
319        // Default: support all entities
320        true
321    }
322
323    /// Get the definition of a specific feature
324    fn get_feature_definition(&self, name: &str) -> Option<&FeatureDefinition> {
325        self.features().iter().find(|f| f.name == name)
326    }
327
328    /// Validate that all feature values are within expected ranges
329    fn validate_features(&self, features: &HashMap<String, f64>) -> Result<()> {
330        for (name, &value) in features {
331            if let Some(definition) = self.get_feature_definition(name) {
332                if !definition.is_valid_value(value) {
333                    return Err(ValknutError::validation(format!(
334                        "Feature '{}' value {} is out of range",
335                        name, value
336                    )));
337                }
338            }
339        }
340        Ok(())
341    }
342}
343
344/// Simplified entity representation for feature extraction.
345/// This will be expanded when we implement the full AST module.
346#[derive(Debug, Clone, PartialEq)]
347pub struct CodeEntity {
348    /// Unique identifier
349    pub id: EntityId,
350
351    /// Entity type (function, class, module, etc.)
352    pub entity_type: String,
353
354    /// Entity name
355    pub name: String,
356
357    /// Source file path
358    pub file_path: String,
359
360    /// Line number range
361    pub line_range: Option<(usize, usize)>,
362
363    /// Raw source code
364    pub source_code: String,
365
366    /// Additional properties
367    pub properties: HashMap<String, serde_json::Value>,
368}
369
370/// Factory and builder methods for [`CodeEntity`].
371impl CodeEntity {
372    /// Create a new code entity
373    pub fn new(
374        id: impl Into<EntityId>,
375        entity_type: impl Into<String>,
376        name: impl Into<String>,
377        file_path: impl Into<String>,
378    ) -> Self {
379        Self {
380            id: id.into(),
381            entity_type: entity_type.into(),
382            name: name.into(),
383            file_path: file_path.into(),
384            line_range: None,
385            source_code: String::new(),
386            properties: HashMap::new(),
387        }
388    }
389
390    /// Set the line range for this entity
391    pub fn with_line_range(mut self, start: usize, end: usize) -> Self {
392        self.line_range = Some((start, end));
393        self
394    }
395
396    /// Set the source code for this entity
397    pub fn with_source_code(mut self, source_code: impl Into<String>) -> Self {
398        self.source_code = source_code.into();
399        self
400    }
401
402    /// Add a property to this entity
403    pub fn add_property(&mut self, key: impl Into<String>, value: serde_json::Value) {
404        self.properties.insert(key.into(), value);
405    }
406
407    /// Get the number of lines in this entity
408    pub fn line_count(&self) -> usize {
409        if let Some((start, end)) = self.line_range {
410            (end - start).max(1)
411        } else {
412            self.source_code.lines().count()
413        }
414    }
415}
416
417/// Context provided to feature extractors during extraction
418#[derive(Debug)]
419pub struct ExtractionContext {
420    /// Global configuration
421    pub config: Arc<crate::core::config::ValknutConfig>,
422
423    /// Index of all entities for dependency analysis
424    pub entity_index: HashMap<EntityId, CodeEntity>,
425
426    /// Language-specific parser information
427    pub language: String,
428
429    /// Additional context data
430    pub context_data: HashMap<String, serde_json::Value>,
431
432    /// Optional pre-filter of candidate similarity peers per entity
433    pub candidate_partitions: Option<Arc<HashMap<EntityId, Vec<EntityId>>>>,
434}
435
436/// Factory and configuration methods for [`ExtractionContext`].
437impl ExtractionContext {
438    /// Create a new extraction context
439    pub fn new(
440        config: Arc<crate::core::config::ValknutConfig>,
441        language: impl Into<String>,
442    ) -> Self {
443        Self {
444            config,
445            entity_index: HashMap::new(),
446            language: language.into(),
447            context_data: HashMap::new(),
448            candidate_partitions: None,
449        }
450    }
451
452    /// Add an entity to the index
453    pub fn add_entity(&mut self, entity: CodeEntity) {
454        self.entity_index.insert(entity.id.clone(), entity);
455    }
456
457    /// Get an entity from the index
458    pub fn get_entity(&self, id: &str) -> Option<&CodeEntity> {
459        self.entity_index.get(id)
460    }
461
462    /// Add context data
463    pub fn add_context_data(&mut self, key: impl Into<String>, value: serde_json::Value) {
464        self.context_data.insert(key.into(), value);
465    }
466
467    /// Attach clique partitions for downstream similarity detectors.
468    pub fn with_candidate_partitions(
469        mut self,
470        partitions: Arc<HashMap<EntityId, Vec<EntityId>>>,
471    ) -> Self {
472        self.candidate_partitions = Some(partitions);
473        self
474    }
475}
476
477/// Base feature extractor with common functionality
478pub struct BaseFeatureExtractor {
479    /// Name of this extractor
480    name: String,
481
482    /// Feature definitions provided by this extractor
483    feature_definitions: Vec<FeatureDefinition>,
484}
485
486/// Factory and helper methods for [`BaseFeatureExtractor`].
487impl BaseFeatureExtractor {
488    /// Create a new base feature extractor
489    pub fn new(name: impl Into<String>) -> Self {
490        Self {
491            name: name.into(),
492            feature_definitions: Vec::new(),
493        }
494    }
495
496    /// Add a feature definition to this extractor
497    pub fn add_feature(&mut self, definition: FeatureDefinition) {
498        self.feature_definitions.push(definition);
499    }
500
501    /// Extract a feature value safely with error handling
502    pub fn safe_extract<F>(&self, feature_name: &str, extraction_func: F) -> f64
503    where
504        F: FnOnce() -> Result<f64>,
505    {
506        match extraction_func() {
507            Ok(value) => {
508                // Validate and clamp the value
509                if let Some(definition) = self.get_feature_definition(feature_name) {
510                    definition.clamp_value(value)
511                } else {
512                    value
513                }
514            }
515            Err(_) => {
516                // Return default value on error
517                self.get_feature_definition(feature_name)
518                    .map(|def| def.default_value)
519                    .unwrap_or(0.0)
520            }
521        }
522    }
523}
524
525/// [`FeatureExtractor`] implementation providing default/empty extraction.
526#[async_trait]
527impl FeatureExtractor for BaseFeatureExtractor {
528    /// Returns the extractor name.
529    fn name(&self) -> &str {
530        &self.name
531    }
532
533    /// Returns the list of feature definitions.
534    fn features(&self) -> &[FeatureDefinition] {
535        &self.feature_definitions
536    }
537
538    /// Default implementation that returns empty features.
539    async fn extract(
540        &self,
541        _entity: &CodeEntity,
542        _context: &ExtractionContext,
543    ) -> Result<HashMap<String, f64>> {
544        // Default implementation returns empty features
545        Ok(HashMap::new())
546    }
547}
548
549/// Registry for managing feature extractors
550#[derive(Default)]
551pub struct FeatureExtractorRegistry {
552    /// Registered extractors
553    extractors: HashMap<String, Arc<dyn FeatureExtractor>>,
554
555    /// All available feature definitions
556    feature_definitions: HashMap<String, FeatureDefinition>,
557}
558
559/// Factory, registration, and extraction methods for [`FeatureExtractorRegistry`].
560impl FeatureExtractorRegistry {
561    /// Create a new registry
562    pub fn new() -> Self {
563        Self::default()
564    }
565
566    /// Register a feature extractor
567    pub fn register(&mut self, extractor: Arc<dyn FeatureExtractor>) {
568        let name = extractor.name().to_string();
569
570        // Add feature definitions from this extractor
571        for feature_def in extractor.features() {
572            self.feature_definitions
573                .insert(feature_def.name.clone(), feature_def.clone());
574        }
575
576        self.extractors.insert(name, extractor);
577    }
578
579    /// Get an extractor by name
580    pub fn get_extractor(&self, name: &str) -> Option<Arc<dyn FeatureExtractor>> {
581        self.extractors.get(name).cloned()
582    }
583
584    /// Get all registered extractors
585    pub fn get_all_extractors(&self) -> impl Iterator<Item = &Arc<dyn FeatureExtractor>> {
586        self.extractors.values()
587    }
588
589    /// Get extractors that support a specific entity type
590    pub fn get_compatible_extractors(&self, entity: &CodeEntity) -> Vec<Arc<dyn FeatureExtractor>> {
591        self.extractors
592            .values()
593            .filter(|extractor| extractor.supports_entity(entity))
594            .cloned()
595            .collect()
596    }
597
598    /// Get a feature definition by name
599    pub fn get_feature_definition(&self, name: &str) -> Option<&FeatureDefinition> {
600        self.feature_definitions.get(name)
601    }
602
603    /// Get all feature definitions
604    pub fn get_all_feature_definitions(&self) -> impl Iterator<Item = &FeatureDefinition> {
605        self.feature_definitions.values()
606    }
607
608    /// Extract features for an entity using all compatible extractors
609    pub async fn extract_all_features(
610        &self,
611        entity: &CodeEntity,
612        context: &ExtractionContext,
613    ) -> Result<FeatureVector> {
614        let mut feature_vector = FeatureVector::new(entity.id.clone());
615
616        // Get compatible extractors
617        let extractors = self.get_compatible_extractors(entity);
618
619        // Extract features from each extractor
620        for extractor in extractors {
621            match extractor.extract(entity, context).await {
622                Ok(features) => {
623                    for (name, value) in features {
624                        feature_vector.add_feature(name, value);
625                    }
626                }
627                Err(e) => {
628                    // Log error but continue with other extractors
629                    tracing::warn!(
630                        "Feature extraction failed for extractor '{}' on entity '{}': {}",
631                        extractor.name(),
632                        entity.id,
633                        e
634                    );
635                }
636            }
637        }
638
639        Ok(feature_vector)
640    }
641}