use std::collections::HashSet;
/// Fixed-size numeric description of a single commit.
///
/// The eight fields correspond, in declaration order, to the eight slots of
/// the flat vector produced by [`CommitFeatures::to_vec`] and consumed by
/// [`CommitFeatures::from_vec`].
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CommitFeatures {
    /// Category code of the commit; `1` is the value [`CommitFeatures::is_fix`] tests for.
    pub defect_category: u8,
    /// Count of files changed, stored as a float.
    pub files_changed: f32,
    /// Count of lines added, stored as a float.
    pub lines_added: f32,
    /// Count of lines deleted, stored as a float.
    pub lines_deleted: f32,
    /// Complexity change attributed to the commit.
    pub complexity_delta: f32,
    /// Commit timestamp; narrowed to `f32` when flattened by `to_vec`.
    pub timestamp: f64,
    /// Hour-of-day value; `Default` sets it to 12.
    pub hour_of_day: u8,
    /// Day-of-week index.
    pub day_of_week: u8,
}

impl Default for CommitFeatures {
    /// Produces a record with every field zeroed except `hour_of_day`,
    /// which starts at 12.
    fn default() -> Self {
        CommitFeatures {
            defect_category: 0,
            files_changed: 0.0,
            lines_added: 0.0,
            lines_deleted: 0.0,
            complexity_delta: 0.0,
            timestamp: 0.0,
            hour_of_day: 12,
            day_of_week: 0,
        }
    }
}

impl CommitFeatures {
    /// Creates a record holding the default values.
    #[must_use]
    pub fn new() -> Self {
        Default::default()
    }

    /// Flattens the record into an 8-element vector, in field declaration order.
    ///
    /// The `f64` timestamp is narrowed to `f32`, so large timestamps lose
    /// precision in the result.
    #[must_use]
    pub fn to_vec(&self) -> Vec<f32> {
        let Self {
            defect_category,
            files_changed,
            lines_added,
            lines_deleted,
            complexity_delta,
            timestamp,
            hour_of_day,
            day_of_week,
        } = *self;

        [
            f32::from(defect_category),
            files_changed,
            lines_added,
            lines_deleted,
            complexity_delta,
            timestamp as f32,
            f32::from(hour_of_day),
            f32::from(day_of_week),
        ]
        .to_vec()
    }

    /// Rebuilds a record from the first eight elements of `v`; any further
    /// elements are ignored.
    ///
    /// The slots that back `u8` fields are converted with `as`, which
    /// truncates the fractional part and saturates at the `u8` bounds.
    ///
    /// # Panics
    ///
    /// Panics if `v` holds fewer than eight elements.
    #[must_use]
    pub fn from_vec(v: &[f32]) -> Self {
        match v {
            &[category, files, added, deleted, complexity, stamp, hour, day, ..] => Self {
                defect_category: category as u8,
                files_changed: files,
                lines_added: added,
                lines_deleted: deleted,
                complexity_delta: complexity,
                timestamp: f64::from(stamp),
                hour_of_day: hour as u8,
                day_of_week: day as u8,
            },
            _ => panic!("Feature vector must have at least 8 elements"),
        }
    }

    /// Total churn: lines added plus lines deleted.
    #[must_use]
    pub fn churn(&self) -> f32 {
        let Self {
            lines_added,
            lines_deleted,
            ..
        } = *self;
        lines_added + lines_deleted
    }

    /// Net size change: lines added minus lines deleted.
    #[must_use]
    pub fn net_change(&self) -> f32 {
        let Self {
            lines_added,
            lines_deleted,
            ..
        } = *self;
        lines_added - lines_deleted
    }

    /// Returns `true` when `defect_category` equals `1`.
    #[must_use]
    pub fn is_fix(&self) -> bool {
        matches!(self.defect_category, 1)
    }
}
/// Raw description of a commit, filled in through chained `with_*` setters.
#[derive(Debug, Clone, Default)]
pub struct CommitDiff {
    /// Number of files changed.
    pub files_changed: u32,
    /// Number of lines added.
    pub lines_added: u32,
    /// Number of lines deleted.
    pub lines_deleted: u32,
    /// Commit timestamp.
    pub timestamp: u64,
    /// Commit message text.
    pub message: String,
}

impl CommitDiff {
    /// Creates a diff with all counters at zero and an empty message.
    #[must_use]
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns the diff with `files_changed` replaced by `n`.
    #[must_use]
    pub fn with_files_changed(self, n: u32) -> Self {
        Self {
            files_changed: n,
            ..self
        }
    }

    /// Returns the diff with `lines_added` replaced by `n`.
    #[must_use]
    pub fn with_lines_added(self, n: u32) -> Self {
        Self {
            lines_added: n,
            ..self
        }
    }

    /// Returns the diff with `lines_deleted` replaced by `n`.
    #[must_use]
    pub fn with_lines_deleted(self, n: u32) -> Self {
        Self {
            lines_deleted: n,
            ..self
        }
    }

    /// Returns the diff with `timestamp` replaced by `ts`.
    #[must_use]
    pub fn with_timestamp(self, ts: u64) -> Self {
        Self {
            timestamp: ts,
            ..self
        }
    }

    /// Returns the diff with `message` replaced by `msg`, converted into a `String`.
    #[must_use]
    pub fn with_message(self, msg: impl Into<String>) -> Self {
        Self {
            message: msg.into(),
            ..self
        }
    }
}
/// Turns [`CommitDiff`] records into [`CommitFeatures`].
///
/// Commit messages are classified by keyword lookup against four keyword
/// sets. The category codes produced are:
///
/// | code | meaning                      |
/// |------|------------------------------|
/// | 0    | no keyword matched           |
/// | 1    | bug keyword matched          |
/// | 2    | security keyword matched     |
/// | 3    | performance keyword matched  |
/// | 4    | refactoring keyword matched  |
#[derive(Debug, Clone)]
pub struct CodeFeatureExtractor {
    /// Lowercase keywords that mark a commit as category 1.
    bug_keywords: HashSet<String>,
    /// Lowercase keywords that mark a commit as category 2.
    security_keywords: HashSet<String>,
    /// Lowercase keywords that mark a commit as category 3.
    perf_keywords: HashSet<String>,
    /// Lowercase keywords that mark a commit as category 4.
    refactor_keywords: HashSet<String>,
    /// Divisor applied to the net line change when estimating complexity.
    complexity_factor: f32,
}

impl Default for CodeFeatureExtractor {
    /// Same as [`CodeFeatureExtractor::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl CodeFeatureExtractor {
    /// Creates an extractor with the built-in keyword sets and a complexity
    /// factor of 10.
    #[must_use]
    pub fn new() -> Self {
        Self {
            bug_keywords: Self::keyword_set(&[
                "fix",
                "bug",
                "error",
                "issue",
                "crash",
                "fault",
                "defect",
                "problem",
                "wrong",
                "broken",
                "fail",
                "mistake",
                "incorrect",
            ]),
            security_keywords: Self::keyword_set(&[
                "security",
                "vulnerability",
                "cve",
                "exploit",
                "injection",
                "xss",
                "csrf",
                "auth",
                "permission",
                "sanitize",
                "escape",
                "unsafe",
            ]),
            perf_keywords: Self::keyword_set(&[
                "performance",
                "perf",
                "optimize",
                "speed",
                "fast",
                "slow",
                "memory",
                "cache",
                "efficient",
                "latency",
                "throughput",
            ]),
            refactor_keywords: Self::keyword_set(&[
                "refactor",
                "clean",
                "rename",
                "move",
                "reorganize",
                "restructure",
                "simplify",
                "extract",
                "inline",
            ]),
            complexity_factor: 10.0,
        }
    }

    /// Builds an owned, lowercased keyword set from string literals.
    fn keyword_set(words: &[&str]) -> HashSet<String> {
        words.iter().map(|word| word.to_lowercase()).collect()
    }

    /// Sets the divisor used by the complexity estimate.
    ///
    /// Values below 1.0 (and NaN) are replaced by 1.0, so the divisor is
    /// never zero or negative.
    #[must_use]
    pub fn with_complexity_factor(mut self, factor: f32) -> Self {
        self.complexity_factor = factor.max(1.0);
        self
    }

    /// Adds extra bug keywords.
    ///
    /// Keywords are lowercased before being stored, because messages are
    /// lowercased before lookup; a keyword kept in mixed case could never match.
    pub fn add_bug_keywords(&mut self, keywords: &[&str]) {
        self.bug_keywords
            .extend(keywords.iter().map(|kw| kw.to_lowercase()));
    }

    /// Adds extra security keywords.
    ///
    /// Keywords are lowercased before being stored, because messages are
    /// lowercased before lookup; a keyword kept in mixed case could never match.
    pub fn add_security_keywords(&mut self, keywords: &[&str]) {
        self.security_keywords
            .extend(keywords.iter().map(|kw| kw.to_lowercase()));
    }

    /// Computes the feature record for one commit.
    ///
    /// The counters and the timestamp are copied over as floats; the
    /// category, complexity estimate, hour and weekday are derived from the
    /// message, the line counts and the timestamp respectively.
    #[must_use]
    pub fn extract(&self, diff: &CommitDiff) -> CommitFeatures {
        let defect_category = self.classify_commit(&diff.message);
        let complexity_delta = self.estimate_complexity_delta(diff);
        let (hour_of_day, day_of_week) = self.extract_time_features(diff.timestamp);

        CommitFeatures {
            defect_category,
            files_changed: diff.files_changed as f32,
            lines_added: diff.lines_added as f32,
            lines_deleted: diff.lines_deleted as f32,
            complexity_delta,
            timestamp: diff.timestamp as f64,
            hour_of_day,
            day_of_week,
        }
    }

    /// Runs [`CodeFeatureExtractor::extract`] on every diff, preserving order.
    #[must_use]
    pub fn extract_batch(&self, diffs: &[CommitDiff]) -> Vec<CommitFeatures> {
        diffs.iter().map(|diff| self.extract(diff)).collect()
    }

    /// Maps a commit message to a category code (see the type-level table).
    ///
    /// Matching is case-insensitive. Each whitespace-separated word is looked
    /// up both as written and as its alphanumeric pieces, so `fix:`,
    /// `(bug)`, `perf(parser):` and `CVE-2021-1234.` are recognised while
    /// custom keywords that contain punctuation (e.g. `off-by-one`) still
    /// match as a whole word.
    ///
    /// When several categories match, security wins over bug, bug over
    /// performance, and performance over refactoring.
    fn classify_commit(&self, message: &str) -> u8 {
        let lower = message.to_lowercase();

        let mut tokens: Vec<&str> = Vec::new();
        for word in lower.split_whitespace() {
            // Whole word first: keeps punctuated custom keywords working.
            tokens.push(word);
            // Then the alphanumeric pieces; a piece equal to the whole word
            // would only duplicate the entry pushed above.
            tokens.extend(
                word.split(|c: char| !c.is_alphanumeric())
                    .filter(|piece| !piece.is_empty() && piece.len() < word.len()),
            );
        }

        // Ordered from highest to lowest priority.
        let ranked: [(&HashSet<String>, u8); 4] = [
            (&self.security_keywords, 2),
            (&self.bug_keywords, 1),
            (&self.perf_keywords, 3),
            (&self.refactor_keywords, 4),
        ];
        for (keywords, category) in &ranked {
            if tokens.iter().any(|token| keywords.contains(*token)) {
                return *category;
            }
        }
        0
    }

    /// Estimates the complexity change as the net line change (added minus
    /// deleted) divided by the configured complexity factor.
    fn estimate_complexity_delta(&self, diff: &CommitDiff) -> f32 {
        let net = diff.lines_added as f32 - diff.lines_deleted as f32;
        net / self.complexity_factor
    }

    /// Splits a timestamp into `(hour_of_day, day_of_week)`.
    ///
    /// The timestamp is treated as a count of seconds. The `+ 4` offset makes
    /// day 0 map to weekday 4, which matches a Unix epoch (a Thursday) with
    /// 0 = Sunday. No time-zone adjustment is applied.
    // Kept as a method (rather than an associated function) so existing
    // `extractor.extract_time_features(..)` call sites keep compiling.
    #[allow(clippy::unused_self)]
    fn extract_time_features(&self, timestamp: u64) -> (u8, u8) {
        const SECONDS_PER_DAY: u64 = 86_400;
        const SECONDS_PER_HOUR: u64 = 3_600;

        let days_since_epoch = timestamp / SECONDS_PER_DAY;
        // Both values are bounded (< 7 and < 24), so the narrowing casts are lossless.
        let day_of_week = ((days_since_epoch + 4) % 7) as u8;
        let hour_of_day = ((timestamp % SECONDS_PER_DAY) / SECONDS_PER_HOUR) as u8;

        (hour_of_day, day_of_week)
    }

    /// Scales the four magnitude features into `[0, 1]` using the maxima in
    /// `stats`; the category, timestamp, hour and weekday are copied unchanged.
    ///
    /// NOTE(review): a negative `complexity_delta` is clamped to 0 here, even
    /// though `FeatureStats` tracks its absolute value — confirm that
    /// dropping the sign is intended.
    #[must_use]
    pub fn normalize(&self, features: &CommitFeatures, stats: &FeatureStats) -> CommitFeatures {
        CommitFeatures {
            defect_category: features.defect_category,
            files_changed: Self::normalize_value(features.files_changed, stats.files_changed_max),
            lines_added: Self::normalize_value(features.lines_added, stats.lines_added_max),
            lines_deleted: Self::normalize_value(features.lines_deleted, stats.lines_deleted_max),
            complexity_delta: Self::normalize_value(
                features.complexity_delta,
                stats.complexity_max,
            ),
            timestamp: features.timestamp,
            hour_of_day: features.hour_of_day,
            day_of_week: features.day_of_week,
        }
    }

    /// Returns `value / max` clamped to `[0, 1]`, or 0 when `max` is not positive.
    fn normalize_value(value: f32, max: f32) -> f32 {
        if max <= 0.0 {
            0.0
        } else {
            (value / max).clamp(0.0, 1.0)
        }
    }
}
/// Per-feature maxima used as normalization denominators.
#[derive(Debug, Clone, Copy, Default)]
pub struct FeatureStats {
    /// Largest `files_changed` value observed.
    pub files_changed_max: f32,
    /// Largest `lines_added` value observed.
    pub lines_added_max: f32,
    /// Largest `lines_deleted` value observed.
    pub lines_deleted_max: f32,
    /// Largest absolute `complexity_delta` observed.
    pub complexity_max: f32,
}

impl FeatureStats {
    /// Scans `features` and records the maximum of each tracked field.
    ///
    /// Every maximum starts at 0.0, so an empty slice — or a field whose
    /// values are all negative — yields 0.0. `complexity_delta` is compared
    /// by absolute value.
    #[must_use]
    pub fn from_features(features: &[CommitFeatures]) -> Self {
        features.iter().fold(Self::default(), |running, sample| Self {
            files_changed_max: running.files_changed_max.max(sample.files_changed),
            lines_added_max: running.lines_added_max.max(sample.lines_added),
            lines_deleted_max: running.lines_deleted_max.max(sample.lines_deleted),
            complexity_max: running.complexity_max.max(sample.complexity_delta.abs()),
        })
    }
}
// Test-only module; its body is loaded from the file `code_features_tests.rs`
// named by the `#[path]` attribute rather than being written inline here.
#[cfg(test)]
#[path = "code_features_tests.rs"]
mod tests;