use crate::models::{Finding, Severity};
use std::collections::HashMap;
/// A flat vector of `f32` feature values.
///
/// This is a thin wrapper around `Vec<f32>`; it adds no validation or
/// normalisation of its own.
#[derive(Debug, Clone)]
pub struct Features {
    /// The raw feature values, in the order they were supplied to [`Features::new`].
    pub values: Vec<f32>,
}

impl Features {
    /// Wraps an already-built vector of feature values.
    pub fn new(values: Vec<f32>) -> Self {
        Features { values }
    }

    /// Number of feature values stored.
    pub fn len(&self) -> usize {
        self.values.as_slice().len()
    }

    /// Returns `true` when no feature values are stored.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
/// Converts a [`Finding`] into a fixed-length numeric [`Features`] vector.
///
/// The vector produced by [`FeatureExtractor::extract`] is laid out, in order, as:
///
/// 1. a one-hot segment with one slot per known detector name,
/// 2. four severity flags (critical, high, medium, low),
/// 3. one keyword flag per entry of `code_patterns`,
/// 4. two path scores (hits against `fp_path_patterns`, then `tp_path_patterns`),
/// 5. four extras: scaled line span, scaled description length,
///    a "has suggested fix" flag and a "has CWE id" flag.
pub struct FeatureExtractor {
    /// Detector name -> slot index inside the one-hot segment.
    detector_vocab: HashMap<String, usize>,
    /// Lowercase keywords searched for in the finding's description and title.
    code_patterns: Vec<&'static str>,
    /// Path substrings counted into the first path score
    /// (presumably "false positive" hints - confirm with the model owner).
    fp_path_patterns: Vec<&'static str>,
    /// Path substrings counted into the second path score
    /// (presumably "true positive" hints - confirm with the model owner).
    tp_path_patterns: Vec<&'static str>,
}

impl FeatureExtractor {
    /// Builds an extractor with the built-in detector vocabulary, keyword list
    /// and path-hint lists.
    pub fn new() -> Self {
        // Position in this table is the detector's slot in the one-hot segment,
        // so the order is part of the feature layout.
        // NOTE(review): "ComplexitySpike" is the only entry without a
        // "Detector" suffix - confirm it matches the name the detector reports.
        const DETECTOR_NAMES: &[&str] = &[
            "SQLInjectionDetector",
            "CommandInjectionDetector",
            "PathTraversalDetector",
            "XssDetector",
            "InsecureCryptoDetector",
            "TorchLoadUnsafeDetector",
            "DeadCodeDetector",
            "UnreachableCodeDetector",
            "LongMethodsDetector",
            "GodClassDetector",
            "FeatureEnvyDetector",
            "ComplexitySpike",
            "MagicNumbersDetector",
            "NPlusOneDetector",
            "InconsistentReturnsDetector",
        ];

        // One keyword flag is emitted per entry, in this order.
        const TEXT_KEYWORDS: &[&str] = &[
            "test",
            "mock",
            "stub",
            "fake",
            "fixture",
            "spec",
            "assert",
            "expect",
            "should",
            "config",
            "env",
            "settings",
            "constant",
            "generated",
            "auto-generated",
            "@generated",
            "orm",
            "query",
            "model",
            "schema",
            "user_input",
            "request",
            "params",
            "body",
            "exec",
            "eval",
            "shell",
            "system",
            "password",
            "secret",
            "token",
            "key",
        ];

        // Matched as plain substrings of the lowercased path, so overlapping
        // entries (e.g. "test" and "tests") can both count for the same path.
        const FP_PATH_HINTS: &[&str] = &[
            "test",
            "tests",
            "spec",
            "specs",
            "__test__",
            "__tests__",
            "fixture",
            "fixtures",
            "mock",
            "mocks",
            "example",
            "examples",
            "demo",
            "sample",
            "vendor",
            "node_modules",
            "generated",
            "dist",
            "build",
            "scripts",
            "script",
            "tools",
            "tool",
            "bin",
            "benchmark",
            "benchmarks",
            "docs",
            "documentation",
            "fix_agent",
            "helper",
            "util",
            "utils",
            "cli/",
            "/cli",
        ];

        const TP_PATH_HINTS: &[&str] = &[
            "src",
            "lib",
            "app",
            "api",
            "routes",
            "handlers",
            "controller",
            "service",
            "auth",
            "security",
        ];

        let mut detector_vocab = HashMap::with_capacity(DETECTOR_NAMES.len());
        for (slot, name) in DETECTOR_NAMES.iter().enumerate() {
            detector_vocab.insert(String::from(*name), slot);
        }

        FeatureExtractor {
            detector_vocab,
            code_patterns: TEXT_KEYWORDS.to_vec(),
            fp_path_patterns: FP_PATH_HINTS.to_vec(),
            tp_path_patterns: TP_PATH_HINTS.to_vec(),
        }
    }

    /// Encodes `finding` as a feature vector of length [`FeatureExtractor::feature_count`].
    ///
    /// A detector name that is not in the vocabulary leaves the one-hot segment
    /// all zeros, and a severity other than critical/high/medium/low leaves all
    /// four severity flags at zero. Only the first entry of `affected_files`
    /// is inspected; with no files, both path scores are zero.
    pub fn extract(&self, finding: &Finding) -> Features {
        /// Maps a boolean onto the 1.0 / 0.0 encoding used for flag features.
        fn flag(on: bool) -> f32 {
            if on {
                1.0
            } else {
                0.0
            }
        }

        /// Counts how many `needles` occur as substrings of `haystack`.
        fn substring_hits(haystack: &str, needles: &[&'static str]) -> f32 {
            needles
                .iter()
                .filter(|needle| haystack.contains(**needle))
                .count() as f32
        }

        let mut out: Vec<f32> = Vec::with_capacity(self.feature_count());

        // 1. Detector one-hot: zero-fill the segment, then light up the known slot.
        out.resize(self.detector_vocab.len(), 0.0);
        if let Some(&slot) = self.detector_vocab.get(&finding.detector) {
            out[slot] = 1.0;
        }

        // 2. Severity flags, always in this fixed order.
        for level in [
            Severity::Critical,
            Severity::High,
            Severity::Medium,
            Severity::Low,
        ] {
            out.push(flag(finding.severity == level));
        }

        // 3. Keyword flags over "<description> <title>", both lowercased.
        let text = format!(
            "{} {}",
            finding.description.to_lowercase(),
            finding.title.to_lowercase()
        );
        out.extend(
            self.code_patterns
                .iter()
                .map(|keyword| flag(text.contains(keyword))),
        );

        // 4. Path scores, computed on the lowercased first affected file.
        let path = match finding.affected_files.first() {
            Some(first) => first.to_string_lossy().to_lowercase(),
            None => String::new(),
        };
        out.push(substring_hits(&path, &self.fp_path_patterns));
        out.push(substring_hits(&path, &self.tp_path_patterns));

        // 5a. Inclusive line span, capped at 100 and scaled into [0, 1].
        //     Saturating arithmetic keeps an end-before-start range at a span of 1.
        let first_line = finding.line_start.unwrap_or(1);
        let last_line = finding.line_end.unwrap_or(first_line);
        let span = last_line.saturating_sub(first_line).saturating_add(1);
        out.push((span as f32).min(100.0) / 100.0);

        // 5b. Description length in bytes, capped at 1000 and scaled into [0, 1].
        out.push((finding.description.len() as f32).min(1000.0) / 1000.0);

        // 5c. Presence flags.
        out.push(flag(finding.suggested_fix.is_some()));
        out.push(flag(finding.cwe_id.is_some()));

        Features::new(out)
    }

    /// Length of every vector returned by [`FeatureExtractor::extract`].
    pub fn feature_count(&self) -> usize {
        // Fixed-size segments emitted by `extract`, besides the two table-driven ones.
        const SEVERITY_FLAGS: usize = 4;
        const PATH_SCORES: usize = 2;
        const SCALAR_EXTRAS: usize = 4;

        self.detector_vocab.len()
            + SEVERITY_FLAGS
            + self.code_patterns.len()
            + PATH_SCORES
            + SCALAR_EXTRAS
    }
}

impl Default for FeatureExtractor {
    /// Equivalent to [`FeatureExtractor::new`].
    fn default() -> Self {
        FeatureExtractor::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Smoke test: a fully populated finding yields a vector of the advertised
    /// length with the expected detector and severity slots set.
    #[test]
    fn test_feature_extraction() {
        let fx = FeatureExtractor::new();

        let sample = Finding {
            id: "test".into(),
            detector: "SQLInjectionDetector".into(),
            severity: Severity::High,
            title: "SQL Injection in query".into(),
            description: "User input passed to exec()".into(),
            affected_files: vec![PathBuf::from("src/api/users.py")],
            line_start: Some(10),
            line_end: Some(15),
            suggested_fix: Some("Use parameterized queries".into()),
            cwe_id: Some("CWE-89".into()),
            ..Default::default()
        };

        let vector = fx.extract(&sample);

        // The vector length must match what the extractor advertises.
        assert_eq!(vector.len(), fx.feature_count());

        // "SQLInjectionDetector" is the first vocabulary entry, hence slot 0.
        assert_eq!(vector.values[0], 1.0);

        // Severity flags follow the detector one-hot; the second one is "high".
        let high_flag = fx.detector_vocab.len() + 1;
        assert_eq!(vector.values[high_flag], 1.0);
    }
}