pub mod architectural;
pub mod compound;
pub mod dependency_chain;
pub mod embedding_scorer;
pub mod embeddings;
pub mod relational;
pub mod structural;
pub mod token_level;
use crate::detectors::function_context::FunctionContextMap;
use crate::graph::GraphQueryExt;
use crate::models::Severity;
/// Result of scoring one entity at a single hierarchy level.
#[derive(Debug, Clone)]
pub struct LevelScore {
    /// Hierarchy level this score was computed for.
    pub level: Level,
    /// Z-score of the entity's raw score at this level.
    pub z_score: f64,
    /// Threshold the z-score is compared against.
    pub threshold: f64,
    /// `true` when `z_score` is strictly greater than `threshold`.
    pub is_surprising: bool,
}
/// Hierarchy levels at which an entity receives a score.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Level {
    /// Token-level score.
    Token,
    /// Structural-feature score.
    Structural,
    /// Dependency-chain score.
    DependencyChain,
    /// Relational score.
    Relational,
    /// Architectural (module-level) score.
    Architectural,
}
impl Level {
pub fn label(&self) -> &'static str {
match self {
Level::Token => "L1 Token",
Level::Structural => "L2 Structural",
Level::DependencyChain => "L1.5 Dependency",
Level::Relational => "L3 Relational",
Level::Architectural => "L4 Architectural",
}
}
}
/// Combined score of one entity across all hierarchy levels.
///
/// Values are produced by `compound::score_entity`.
#[derive(Debug, Clone)]
pub struct CompoundScore {
    /// The per-level scores this compound score was built from.
    pub level_scores: Vec<LevelScore>,
    /// Concordance count; the engine only stores scores with a value of at
    /// least 1. NOTE(review): presumably the number of levels flagged as
    /// surprising — confirm in `compound::score_entity`.
    pub concordance: usize,
    /// Combined surprise value; used to rank entities, highest first.
    pub compound_surprise: f64,
    /// Severity assigned to the entity by `compound::score_entity`.
    pub severity: Severity,
}
use std::collections::HashMap;
use std::sync::Arc;
use tracing::debug;
/// Engine that trains the per-level models and stores the resulting
/// compound scores per function.
pub struct PredictiveCodingEngine {
    /// Stored compound scores, keyed by the function's qualified name.
    scores: HashMap<String, CompoundScore>,
    /// Number of hierarchy levels reported by `level_count()`.
    level_count: usize,
}
/// Bundle of the models produced by training and consumed by scoring.
struct TrainedModels {
    /// Token-level scorer trained on source file contents.
    token_scorer: token_level::TokenLevelScorer,
    /// Structural scorer fitted on the feature vectors of all functions.
    structural_scorer: structural::StructuralScorer,
    /// `(qualified name, structural feature vector)` per function, in the
    /// same order as the function slice used for training.
    func_features: Vec<(String, Vec<f64>)>,
    /// Scorer fed with the surprisal of extracted dependency chains.
    chain_scorer: dependency_chain::DependencyChainScorer,
    /// Relational scorer (embedding-based or Mahalanobis-based).
    relational_scorer: relational::RelationalScorer,
    /// Architectural scorer holding one profile per module.
    arch_scorer: architectural::ArchitecturalScorer,
}
impl PredictiveCodingEngine {
pub fn new() -> Self {
Self {
scores: HashMap::new(),
level_count: 5,
}
}
/// Returns the number of hierarchy levels this engine was created with.
pub fn level_count(&self) -> usize {
    let count = self.level_count;
    count
}
/// Trains all per-level models and scores the graph's functions.
///
/// Does nothing when the graph holds fewer than 20 functions. Otherwise the
/// models are trained and every resulting compound score with a concordance
/// of at least 1 is inserted into the engine's score map. Entries from
/// earlier calls are not cleared first.
///
/// * `graph` - source of functions, call edges and fan-in/fan-out.
/// * `files` - provider of file contents and the repository path.
/// * `contexts` - per-function contexts used by the relational level.
/// * `cached_embeddings` - optional cached embeddings; when `None`, the
///   relational level is built from `contexts` instead.
pub fn train_and_score(
    &mut self,
    graph: &dyn crate::graph::GraphQuery,
    files: &dyn crate::detectors::file_provider::FileProvider,
    contexts: &FunctionContextMap,
    cached_embeddings: Option<&embedding_scorer::CachedEmbeddings>,
) {
    // Below this many functions no models are trained at all.
    const MIN_FUNCTIONS: usize = 20;

    let functions = graph.get_functions_shared();
    if functions.len() >= MIN_FUNCTIONS {
        let models = self.train_models(graph, files, contexts, &functions, cached_embeddings);
        self.score_functions(graph, files, &functions, contexts, &models);
    }
}
/// Fits the model of every hierarchy level from the graph and source files.
///
/// Stages, in order:
/// - **L1 token** — trains the token scorer on every file with one of the
///   listed extensions; files of an extension are skipped once that
///   extension has accumulated `L1_TOKEN_SATURATION` whitespace-separated
///   tokens.
/// - **L2 structural** — builds one feature vector per function (parameter
///   count, complexity, `maxNesting`, lines of code, `returnCount`) and fits
///   the structural scorer on them.
/// - **L1.5 dependency chains** — extracts call chains, takes the first
///   source line of each function in a chain and records the chain's
///   surprisal under a confident token model. Skipped entirely when no
///   token model is confident.
/// - **L3 relational** — uses the cached embeddings when provided,
///   otherwise falls back to a scorer built from `contexts`.
/// - **L4 architectural** — groups functions by directory (the path text
///   before the last `/`, or `"root"`) and registers each module's average
///   call fan-in and fan-out.
///
/// Returns the trained models bundled as [`TrainedModels`].
fn train_models(
    &self,
    graph: &dyn crate::graph::GraphQuery,
    files: &dyn crate::detectors::file_provider::FileProvider,
    contexts: &FunctionContextMap,
    functions: &[crate::graph::CodeNode],
    cached_embeddings: Option<&embedding_scorer::CachedEmbeddings>,
) -> TrainedModels {
    let i = graph.interner();
    let repo_path = files.repo_path();

    // ---- L1: token model, one per file extension ----
    let mut token_scorer = token_level::TokenLevelScorer::new();
    let extensions: &[&str] = &[
        "rs", "py", "ts", "tsx", "js", "jsx", "go", "java", "c", "cpp", "cc", "h", "hpp", "cs",
    ];
    const L1_TOKEN_SATURATION: usize = 50_000;
    // Running whitespace-token count per extension, used to cap training.
    let mut lang_tokens: HashMap<&str, usize> = HashMap::new();
    for path in files.files_with_extensions(extensions) {
        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
        if lang_tokens.get(ext).copied().unwrap_or(0) >= L1_TOKEN_SATURATION {
            continue;
        }
        if let Some(content) = files.content(path) {
            token_scorer.train_file(&content, ext);
            *lang_tokens.entry(ext).or_insert(0) += content.split_whitespace().count();
        }
    }

    // ---- L2: structural features, same order as `functions` ----
    let mut feature_vecs: Vec<Vec<f64>> = Vec::with_capacity(functions.len());
    let mut func_features: Vec<(String, Vec<f64>)> = Vec::with_capacity(functions.len());
    for func in functions.iter() {
        let params = func.param_count_opt().unwrap_or(0);
        let complexity = func.complexity_opt().unwrap_or(1);
        let nesting = func.get_i64("maxNesting").unwrap_or(0);
        let loc = func.loc();
        let returns = func.get_i64("returnCount").unwrap_or(1);
        let feat = structural::extract_structural_features_raw(
            params, complexity, nesting, loc, returns,
        );
        feature_vecs.push(feat.clone());
        func_features.push((func.qn(i).to_string(), feat));
    }
    let structural_scorer = structural::StructuralScorer::from_features(&feature_vecs);

    // ---- L1.5: dependency chains ----
    let calls: Vec<(String, String)> = graph
        .get_calls()
        .into_iter()
        .map(|(a, b)| (i.resolve(a).to_string(), i.resolve(b).to_string()))
        .collect();
    let chains = dependency_chain::extract_dependency_chains_bounded(&calls, 4, 10_000);
    let mut chain_scorer = dependency_chain::DependencyChainScorer::new();

    // Token training is finished, so the confident model cannot change any
    // more: look it up once instead of once per chain.
    // NOTE(review): when several extensions have a confident model, the one
    // picked is whichever `values()` yields first — if `models` is a
    // `HashMap` that order is arbitrary. Confirm whether that is intended.
    let chain_model = token_scorer.models.values().find(|m| m.is_confident());
    if let Some(model) = chain_model {
        let fn_by_name: HashMap<&str, &crate::graph::CodeNode> =
            functions.iter().map(|f| (f.qn(i), f)).collect();
        // File path -> lines of that file, so each file is split only once.
        let mut lines_cache: HashMap<&str, Vec<String>> = HashMap::new();
        for chain in &chains {
            // First source line of every chain member that can be resolved.
            let chain_lines: Vec<String> = chain
                .iter()
                .filter_map(|qn| fn_by_name.get(qn.as_str()).copied())
                .filter_map(|f| {
                    let cached_lines = lines_cache.entry(f.path(i)).or_insert_with(|| {
                        let path = repo_path.join(f.path(i));
                        let content = files
                            .content(&path)
                            .unwrap_or_else(|| Arc::new(String::new()));
                        content.lines().map(|l| l.to_string()).collect()
                    });
                    if cached_lines.is_empty() {
                        return None;
                    }
                    let start = f.line_start.saturating_sub(1) as usize;
                    let end = (f.line_end as usize).min(cached_lines.len());
                    if start < end {
                        Some(cached_lines[start].clone())
                    } else {
                        None
                    }
                })
                .filter(|s| !s.is_empty())
                .collect();
            if chain_lines.is_empty() {
                continue;
            }
            let line_refs: Vec<&str> = chain_lines.iter().map(|s| s.as_str()).collect();
            let surprisal = dependency_chain::chain_surprisal(model, &line_refs);
            chain_scorer.record_chain(chain, surprisal);
        }
        // `lines_cache` is dropped here, before the remaining stages run.
    }
    debug!(
        "[predictive] L1.5 extracted {} chains from {} calls",
        chains.len(),
        calls.len()
    );

    // ---- L3: relational ----
    let relational_scorer = if let Some(cached) = cached_embeddings {
        tracing::debug!(
            "[predictive] L3 using quantized node2vec embeddings ({} vectors)",
            cached.entries.len()
        );
        relational::RelationalScorer::Embedding(
            embedding_scorer::EmbeddingRelationalScorer::from_cache(cached, 10),
        )
    } else {
        tracing::debug!("[predictive] L3 falling back to Mahalanobis (no cached embeddings)");
        relational::RelationalScorer::Mahalanobis(
            relational::GraphRelationalScorer::from_contexts(contexts),
        )
    };

    // ---- L4: architectural, one profile per directory ----
    let mut arch_scorer = architectural::ArchitecturalScorer::new();
    let mut module_funcs: HashMap<String, Vec<&crate::graph::CodeNode>> = HashMap::new();
    for func in functions.iter() {
        let module = func
            .path(i)
            .rsplit_once('/')
            .map(|(dir, _)| dir)
            .unwrap_or("root");
        module_funcs
            .entry(module.to_string())
            .or_default()
            .push(func);
    }
    for (module_path, funcs) in &module_funcs {
        let count = funcs.len().max(1) as f64;
        let avg_fan_in = funcs
            .iter()
            .map(|f| graph.call_fan_in(f.qn(i)) as f64)
            .sum::<f64>()
            / count;
        let avg_fan_out = funcs
            .iter()
            .map(|f| graph.call_fan_out(f.qn(i)) as f64)
            .sum::<f64>()
            / count;
        arch_scorer.add_module(
            module_path,
            architectural::ModuleProfile {
                avg_fan_in,
                avg_fan_out,
                // Cohesion, coupling and smell count are fixed constants
                // here; they are not derived from the graph.
                internal_cohesion: 0.5,
                external_coupling: 0.5,
                entity_count: funcs.len(),
                smell_type_count: 0,
            },
        );
    }
    arch_scorer.finalize();

    TrainedModels {
        token_scorer,
        structural_scorer,
        func_features,
        chain_scorer,
        relational_scorer,
        arch_scorer,
    }
}
fn score_functions(
&mut self,
graph: &dyn crate::graph::GraphQuery,
files: &dyn crate::detectors::file_provider::FileProvider,
functions: &[crate::graph::CodeNode],
contexts: &FunctionContextMap,
models: &TrainedModels,
) {
let i = graph.interner();
let repo_path = files.repo_path();
const MAX_SCORED: usize = 5_000;
let scored_indices: Vec<usize> = if functions.len() > MAX_SCORED {
let mut indexed_dists: Vec<(usize, f64)> = models
.func_features
.iter()
.enumerate()
.map(|(i, (_, feat))| (i, models.structural_scorer.mahalanobis_distance(feat)))
.collect();
indexed_dists
.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
indexed_dists.truncate(MAX_SCORED);
indexed_dists.into_iter().map(|(i, _)| i).collect()
} else {
(0..functions.len()).collect()
};
let thresholds = compound::default_thresholds();
struct RawScores {
token: f64,
structural: f64,
dep_chain: f64,
relational: f64,
architectural: f64,
}
let mut raw_scores: Vec<(String, RawScores)> = Vec::with_capacity(scored_indices.len());
let mut file_cache: HashMap<&str, Option<Arc<String>>> = HashMap::new();
for &idx in &scored_indices {
let func = &functions[idx];
let ext = func.path(i).rsplit('.').next().unwrap_or("rs");
let has_model = models
.token_scorer
.models
.get(ext)
.map(|m| m.is_confident())
.unwrap_or(false);
let token_score = if has_model {
let content = file_cache.entry(func.path(i)).or_insert_with(|| {
let path = repo_path.join(func.path(i));
files.content(&path)
});
if let Some(content) = content {
let lines: Vec<&str> = content.lines().collect();
let start = func.line_start.saturating_sub(1) as usize;
let end = (func.line_end as usize).min(lines.len());
if start < end && end - start >= 4 {
models.token_scorer.score_function(&lines[start..end], ext)
} else {
0.0
}
} else {
0.0
}
} else {
0.0
};
let structural_score = models
.func_features
.get(idx)
.map(|(_, feat)| models.structural_scorer.mahalanobis_distance(feat))
.unwrap_or(0.0);
let dep_score = models.chain_scorer.score(func.qn(i));
let relational_score = models.relational_scorer.distance(func.qn(i), contexts);
let module = func
.path(i)
.rsplit_once('/')
.map(|(dir, _)| dir)
.unwrap_or("root");
let arch_score = models.arch_scorer.module_distance(module);
raw_scores.push((
func.qn(i).to_string(),
RawScores {
token: token_score,
structural: structural_score,
dep_chain: dep_score,
relational: relational_score,
architectural: arch_score,
},
));
}
let token_raw: Vec<f64> = raw_scores.iter().map(|(_, r)| r.token).collect();
let struct_raw: Vec<f64> = raw_scores.iter().map(|(_, r)| r.structural).collect();
let dep_raw: Vec<f64> = raw_scores.iter().map(|(_, r)| r.dep_chain).collect();
let rel_raw: Vec<f64> = raw_scores.iter().map(|(_, r)| r.relational).collect();
let arch_raw: Vec<f64> = raw_scores.iter().map(|(_, r)| r.architectural).collect();
let token_z = z_scores_from_raw(&token_raw);
let struct_z = z_scores_from_raw(&struct_raw);
let dep_z = z_scores_from_raw(&dep_raw);
let rel_z = z_scores_from_raw(&rel_raw);
let arch_z = z_scores_from_raw(&arch_raw);
let mut all_z_scores: HashMap<Level, Vec<f64>> = HashMap::new();
all_z_scores.insert(Level::Token, token_z.clone());
all_z_scores.insert(Level::Structural, struct_z.clone());
all_z_scores.insert(Level::DependencyChain, dep_z.clone());
all_z_scores.insert(Level::Relational, rel_z.clone());
all_z_scores.insert(Level::Architectural, arch_z.clone());
let weights = compound::compute_precision_weights(&all_z_scores);
for (i, (qn, _)) in raw_scores.iter().enumerate() {
let levels = vec![
LevelScore {
level: Level::Token,
z_score: token_z[i],
threshold: *thresholds.get(&Level::Token).unwrap_or(&2.5),
is_surprising: token_z[i] > *thresholds.get(&Level::Token).unwrap_or(&2.5),
},
LevelScore {
level: Level::Structural,
z_score: struct_z[i],
threshold: *thresholds.get(&Level::Structural).unwrap_or(&2.0),
is_surprising: struct_z[i]
> *thresholds.get(&Level::Structural).unwrap_or(&2.0),
},
LevelScore {
level: Level::DependencyChain,
z_score: dep_z[i],
threshold: *thresholds.get(&Level::DependencyChain).unwrap_or(&2.0),
is_surprising: dep_z[i]
> *thresholds.get(&Level::DependencyChain).unwrap_or(&2.0),
},
LevelScore {
level: Level::Relational,
z_score: rel_z[i],
threshold: *thresholds.get(&Level::Relational).unwrap_or(&1.5),
is_surprising: rel_z[i] > *thresholds.get(&Level::Relational).unwrap_or(&1.5),
},
LevelScore {
level: Level::Architectural,
z_score: arch_z[i],
threshold: *thresholds.get(&Level::Architectural).unwrap_or(&2.0),
is_surprising: arch_z[i]
> *thresholds.get(&Level::Architectural).unwrap_or(&2.0),
},
];
let score = compound::score_entity(levels, &weights);
if score.concordance >= 1 {
self.scores.insert(qn.clone(), score);
}
}
}
/// Returns the stored entities whose concordance is at least
/// `min_concordance`, most surprising first.
///
/// Each item is `(qualified name, score)`. Ordering is by
/// `compound_surprise` descending; entries with a NaN surprise come last,
/// and equal surprises are ordered by name so the result is deterministic.
pub fn get_surprising_entities(&self, min_concordance: usize) -> Vec<(&str, &CompoundScore)> {
    use std::cmp::Ordering;

    let mut results: Vec<(&str, &CompoundScore)> = self
        .scores
        .iter()
        .filter(|(_, s)| s.concordance >= min_concordance)
        .map(|(k, v)| (k.as_str(), v))
        .collect();
    results.sort_by(|a, b| {
        let (left, right) = (a.1.compound_surprise, b.1.compound_surprise);
        // `sort_by` needs a total order, and NaN has none under
        // `partial_cmp`; place NaN explicitly after every real value.
        let by_surprise = match (left.is_nan(), right.is_nan()) {
            (true, true) => Ordering::Equal,
            (true, false) => Ordering::Greater,
            (false, true) => Ordering::Less,
            (false, false) => right.partial_cmp(&left).unwrap_or(Ordering::Equal),
        };
        // The source map iterates in arbitrary order; break ties by name.
        by_surprise.then_with(|| a.0.cmp(b.0))
    });
    results
}
/// Looks up the stored compound score for `qualified_name`.
///
/// Returns `None` when no score is stored under that name.
pub fn get_score(&self, qualified_name: &str) -> Option<&CompoundScore> {
    let stored = &self.scores;
    stored.get(qualified_name)
}
}
impl Default for PredictiveCodingEngine {
fn default() -> Self {
Self::new()
}
}
/// Converts raw scores to z-scores using the population mean and standard
/// deviation of the finite inputs.
///
/// The result has the same length as `values` and every element is finite:
/// - non-finite inputs (NaN, ±infinity) are left out of the statistics and
///   map to `0.0`, so one bad score cannot turn the whole level into NaN;
/// - if fewer than two inputs are finite, or their standard deviation is
///   below `1e-10` (or not finite), every output is `0.0`.
///
/// For inputs that are all finite the result is identical to a plain
/// `(v - mean) / std` with population variance.
fn z_scores_from_raw(values: &[f64]) -> Vec<f64> {
    let finite_count = values.iter().filter(|v| v.is_finite()).count();
    if finite_count < 2 {
        return vec![0.0; values.len()];
    }

    let n = finite_count as f64;
    let mean = values.iter().filter(|v| v.is_finite()).sum::<f64>() / n;
    let variance = values
        .iter()
        .filter(|v| v.is_finite())
        .map(|v| (v - mean).powi(2))
        .sum::<f64>()
        / n;
    let std = variance.sqrt();

    // A non-finite std can only come from overflow while summing; treat it
    // like the degenerate (constant) case rather than emitting NaN.
    if !std.is_finite() || std < 1e-10 {
        return vec![0.0; values.len()];
    }

    values
        .iter()
        .map(|&v| if v.is_finite() { (v - mean) / std } else { 0.0 })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used for floating-point comparisons in these tests.
    const EPS: f64 = 1e-9;

    /// A freshly built engine reports five levels and holds no scores.
    #[test]
    fn test_engine_constructible() {
        let fresh = PredictiveCodingEngine::new();
        assert_eq!(fresh.level_count(), 5);
        assert!(fresh.get_surprising_entities(1).is_empty());
    }

    /// Training on an empty graph stores nothing.
    #[test]
    fn test_engine_empty_graph() {
        let empty_files = crate::detectors::file_provider::MockFileProvider::new(vec![]);
        let graph = crate::graph::builder::GraphBuilder::new().freeze();
        let contexts = std::collections::HashMap::new();

        let mut engine = PredictiveCodingEngine::new();
        engine.train_and_score(&graph, &empty_files, &contexts, None);

        assert!(engine.get_surprising_entities(1).is_empty());
    }

    /// Two distinct values normalise to -1 and +1.
    #[test]
    fn test_z_scores_from_raw_basic() {
        let z = z_scores_from_raw(&[0.0, 10.0]);
        assert!((z[0] + 1.0).abs() < EPS);
        assert!((z[1] - 1.0).abs() < EPS);
    }

    /// Constant input has zero spread, so every z-score is zero.
    #[test]
    fn test_z_scores_from_raw_constant() {
        let z = z_scores_from_raw(&[5.0, 5.0, 5.0]);
        assert!(z.iter().all(|val| val.abs() < EPS));
    }

    /// A single value yields a single zero.
    #[test]
    fn test_z_scores_from_raw_single_value() {
        let z = z_scores_from_raw(&[42.0]);
        assert_eq!(z.len(), 1);
        assert!(z[0].abs() < EPS);
    }

    /// Empty input yields empty output.
    #[test]
    fn test_z_scores_from_raw_empty() {
        assert!(z_scores_from_raw(&[]).is_empty());
    }

    /// `Default` builds the same engine as `new`.
    #[test]
    fn test_engine_default() {
        let from_default = PredictiveCodingEngine::default();
        assert_eq!(from_default.level_count(), 5);
    }
}