impl DataScienceAnalyzer {
    /// Partitions `findings` into up to `self.k_clusters` groups by running
    /// k-means over a fixed-width feature vector per finding, tags each
    /// finding's `cluster_id` in place, and returns one summary per cluster.
    ///
    /// Degenerate inputs bypass k-means: an empty slice yields an empty Vec,
    /// and a slice with fewer findings than requested clusters collapses into
    /// a single catch-all cluster 0. Matrix construction or fitting failures
    /// yield an empty Vec and leave the findings untagged.
    #[provable_contracts_macros::contract("pmat-core.yaml", equation = "check_compliance")]
    pub fn cluster_findings(&self, findings: &mut [Finding]) -> Vec<FindingCluster> {
        if findings.is_empty() || findings.len() < self.k_clusters {
            if findings.is_empty() {
                return Vec::new();
            }
            // Too few findings for k-means: place everything in one cluster.
            for finding in findings.iter_mut() {
                finding.cluster_id = Some(0);
            }
            // Fix: use the same cohesion definition as build_cluster_summaries
            // (1 / distinct-category count) instead of a hard-coded 1.0, so the
            // degenerate path reports values comparable to the k-means path.
            let unique_categories: std::collections::HashSet<_> =
                findings.iter().map(|f| &f.category).collect();
            return vec![FindingCluster {
                id: 0,
                size: findings.len(),
                primary_category: findings
                    .first()
                    .map(|f| f.category.clone())
                    .unwrap_or_default(),
                cohesion: 1.0 / (unique_categories.len() as f64).max(1.0),
                description: "All findings".to_string(),
                finding_ids: findings.iter().map(|f| f.id.clone()).collect(),
            }];
        }
        // Flatten per-finding feature vectors into one row-major matrix.
        let vectors: Vec<Vec<f32>> = findings
            .iter()
            .map(|f| self.finding_to_features(f))
            .collect();
        let rows = vectors.len();
        // Safe to index: findings (and therefore vectors) is non-empty here.
        let cols = vectors[0].len();
        let data: Vec<f32> = vectors.iter().flat_map(|v| v.iter().copied()).collect();
        let matrix = match Matrix::from_vec(rows, cols, data) {
            Ok(m) => m,
            // Best-effort: a malformed matrix means no clustering, not a panic.
            Err(_) => return Vec::new(),
        };
        let mut kmeans = KMeans::new(self.k_clusters).with_max_iter(100);
        if kmeans.fit(&matrix).is_err() {
            return Vec::new();
        }
        let labels = kmeans.predict(&matrix);
        // NOTE(review): assumes predict returns exactly one label per row;
        // zip would silently truncate if it ever returned fewer — confirm.
        for (finding, &label) in findings.iter_mut().zip(labels.iter()) {
            finding.cluster_id = Some(label);
        }
        self.build_cluster_summaries(findings, &labels)
    }

    /// Projects a finding onto a fixed 6-dimensional f32 feature vector for
    /// k-means. The encodings are deliberately cheap proxies, not semantic
    /// embeddings: string lengths are folded modulo small constants and the
    /// line number is squashed into [0, 1].
    #[allow(clippy::cast_possible_truncation)]
    fn finding_to_features(&self, finding: &Finding) -> Vec<f32> {
        vec![
            // Ordinal severity encoding: Low=0 .. Critical=3.
            match finding.severity {
                super::types::Severity::Low => 0.0,
                super::types::Severity::Medium => 1.0,
                super::types::Severity::High => 2.0,
                super::types::Severity::Critical => 3.0,
            },
            finding.confidence,
            // Length-mod buckets stand in for category / path identity.
            (finding.category.len() % 10) as f32,
            (finding.location.file.to_string_lossy().len() % 20) as f32,
            // Line number normalized; files of 10_000+ lines saturate at 1.0.
            (finding.location.line as f32 / 10000.0).min(1.0),
            // Whether an automated fix suggestion exists.
            if finding.fix_suggestion.is_some() { 1.0 } else { 0.0 },
        ]
    }

    /// Builds one `FindingCluster` summary per distinct label, pairing
    /// `findings[i]` with `labels[i]`. Cohesion is 1 / distinct-category count
    /// within the cluster; the primary category is the most frequent one
    /// (ties broken arbitrarily by hash order).
    fn build_cluster_summaries(
        &self,
        findings: &[Finding],
        labels: &[usize],
    ) -> Vec<FindingCluster> {
        // Group findings by their assigned cluster label.
        let mut cluster_findings: HashMap<usize, Vec<&Finding>> = HashMap::new();
        for (finding, &label) in findings.iter().zip(labels.iter()) {
            cluster_findings.entry(label).or_default().push(finding);
        }
        let mut clusters: Vec<FindingCluster> = cluster_findings
            .into_iter()
            .map(|(id, cluster_items)| {
                let mut category_counts: HashMap<&str, usize> = HashMap::new();
                for finding in &cluster_items {
                    *category_counts.entry(&finding.category).or_insert(0) += 1;
                }
                // The count map's key set already IS the distinct-category set;
                // the previous separate HashSet pass was redundant.
                let cohesion = 1.0 / (category_counts.len() as f64).max(1.0);
                let primary_category = category_counts
                    .into_iter()
                    .max_by_key(|&(_, count)| count)
                    .map(|(cat, _)| cat.to_string())
                    .unwrap_or_default();
                // Build the description before moving primary_category into the
                // struct, avoiding the clone the previous version needed.
                let description = format!("{} Issues", primary_category);
                FindingCluster {
                    id,
                    size: cluster_items.len(),
                    primary_category,
                    cohesion,
                    description,
                    finding_ids: cluster_items.iter().map(|f| f.id.clone()).collect(),
                }
            })
            .collect();
        // Fix: HashMap iteration order is nondeterministic; sort so callers
        // (and tests) see a stable, reproducible cluster order.
        clusters.sort_unstable_by_key(|c| c.id);
        clusters
    }
}