use sketch_oxide::quantiles::DDSketch;
use sketch_oxide::{CountMinSketch, HeavyKeeper, ThetaSketch, UltraLogLog};
use xxhash_rust::xxh3::xxh3_64;
pub struct SketchFeatures {
pub token_cardinality: UltraLogLog,
pub token_frequencies: CountMinSketch,
pub complexity_sketch: DDSketch,
pub token_set: ThetaSketch,
}
impl SketchFeatures {
pub fn new() -> Result<Self, String> {
let token_cardinality =
UltraLogLog::new(12).map_err(|e| format!("UltraLogLog init: {e}"))?;
let token_frequencies =
CountMinSketch::new(0.01, 0.01).map_err(|e| format!("CountMinSketch init: {e}"))?;
let complexity_sketch = DDSketch::new(0.01).map_err(|e| format!("DDSketch init: {e}"))?;
let token_set = ThetaSketch::new(12).map_err(|e| format!("ThetaSketch init: {e}"))?;
Ok(Self {
token_cardinality,
token_frequencies,
complexity_sketch,
token_set,
})
}
pub fn from_tokens(tokens: &[&str]) -> Result<Self, String> {
let mut features = Self::new()?;
for &token in tokens {
let hash = xxh3_64(token.as_bytes());
features.token_cardinality.add(&hash);
features.token_frequencies.update(&hash);
let complexity_value = (hash % 1000) as f64 + 1.0;
features.complexity_sketch.add(complexity_value);
features.token_set.update(&hash);
}
Ok(features)
}
pub fn estimated_unique_tokens(&self) -> f64 {
self.token_cardinality.cardinality()
}
pub fn token_frequency(&self, token: &str) -> u64 {
let hash = xxh3_64(token.as_bytes());
self.token_frequencies.estimate(&hash)
}
pub fn median_complexity(&self) -> Option<f64> {
self.complexity_sketch.quantile(0.5)
}
pub fn p99_complexity(&self) -> Option<f64> {
self.complexity_sketch.quantile(0.99)
}
}
pub fn jaccard_similarity(a: &ThetaSketch, b: &ThetaSketch) -> f64 {
let intersection = match a.intersect(b) {
Ok(s) => s,
Err(_) => return 0.0,
};
let union = match a.union(b) {
Ok(s) => s,
Err(_) => return 0.0,
};
let union_est = union.estimate();
if union_est <= 0.0 {
return 0.0;
}
let intersection_est = intersection.estimate();
(intersection_est / union_est).clamp(0.0, 1.0)
}
pub struct ProjectSketchStats {
pub call_frequency: CountMinSketch,
pub heaviest_snippets: HeavyKeeper,
}
impl ProjectSketchStats {
pub fn new() -> Result<Self, String> {
let call_frequency =
CountMinSketch::new(0.001, 0.01).map_err(|e| format!("CountMinSketch init: {e}"))?;
let heaviest_snippets =
HeavyKeeper::new(100, 0.001, 0.01).map_err(|e| format!("HeavyKeeper init: {e}"))?;
Ok(Self {
call_frequency,
heaviest_snippets,
})
}
pub fn record_call(&mut self, function_name: &str) {
let hash = xxh3_64(function_name.as_bytes());
self.call_frequency.update(&hash);
}
pub fn record_snippet(&mut self, snippet: &[u8]) {
self.heaviest_snippets.update(snippet);
}
pub fn estimated_calls(&self, function_name: &str) -> u64 {
let hash = xxh3_64(function_name.as_bytes());
self.call_frequency.estimate(&hash)
}
pub fn top_snippets(&self) -> Vec<(u64, u32)> {
self.heaviest_snippets.top_k()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sketch_features_new() {
let features = SketchFeatures::new();
assert!(features.is_ok());
}
#[test]
fn test_sketch_features_from_empty_tokens() {
let features = SketchFeatures::from_tokens(&[]).unwrap();
assert!(features.estimated_unique_tokens() < 1.0);
}
#[test]
fn test_sketch_features_from_tokens_cardinality() {
let tokens = vec!["fn", "main", "(", ")", "{", "let", "x", "=", "1", ";", "}"];
let features = SketchFeatures::from_tokens(&tokens).unwrap();
let est = features.estimated_unique_tokens();
assert!(
(5.0..=20.0).contains(&est),
"Expected cardinality near 11, got {est}"
);
}
#[test]
fn test_sketch_features_token_frequency() {
let tokens = vec!["let", "let", "let", "x", "y"];
let features = SketchFeatures::from_tokens(&tokens).unwrap();
let freq = features.token_frequency("let");
assert!(freq >= 3, "Expected frequency >= 3 for 'let', got {freq}");
let freq_x = features.token_frequency("x");
assert!(freq_x >= 1, "Expected frequency >= 1 for 'x', got {freq_x}");
}
#[test]
fn test_sketch_features_complexity_quantiles() {
let tokens: Vec<&str> = (0..100).map(|_| "tok").collect();
let features = SketchFeatures::from_tokens(&tokens).unwrap();
let median = features.median_complexity();
assert!(median.is_some(), "Expected median to be Some");
let p99 = features.p99_complexity();
assert!(p99.is_some(), "Expected p99 to be Some");
}
#[test]
fn test_jaccard_similarity_identical_sets() {
let tokens = vec!["a", "b", "c", "d", "e"];
let a = SketchFeatures::from_tokens(&tokens).unwrap();
let b = SketchFeatures::from_tokens(&tokens).unwrap();
let sim = jaccard_similarity(&a.token_set, &b.token_set);
assert!(
sim > 0.8,
"Identical token sets should have high similarity, got {sim}"
);
}
#[test]
fn test_jaccard_similarity_disjoint_sets() {
let tokens_a = vec!["a", "b", "c"];
let tokens_b = vec!["x", "y", "z"];
let a = SketchFeatures::from_tokens(&tokens_a).unwrap();
let b = SketchFeatures::from_tokens(&tokens_b).unwrap();
let sim = jaccard_similarity(&a.token_set, &b.token_set);
assert!(
sim < 0.3,
"Disjoint token sets should have low similarity, got {sim}"
);
}
#[test]
fn test_jaccard_similarity_partial_overlap() {
let tokens_a = vec!["a", "b", "c", "d"];
let tokens_b = vec!["c", "d", "e", "f"];
let a = SketchFeatures::from_tokens(&tokens_a).unwrap();
let b = SketchFeatures::from_tokens(&tokens_b).unwrap();
let sim = jaccard_similarity(&a.token_set, &b.token_set);
assert!(
sim > 0.1 && sim < 0.7,
"Partial overlap should yield moderate similarity, got {sim}"
);
}
#[test]
fn test_jaccard_similarity_empty_sets() {
let a = SketchFeatures::from_tokens(&[]).unwrap();
let b = SketchFeatures::from_tokens(&[]).unwrap();
let sim = jaccard_similarity(&a.token_set, &b.token_set);
assert!(
(0.0..=1.0).contains(&sim),
"Empty set similarity should be in [0, 1], got {sim}"
);
}
#[test]
fn test_project_sketch_stats_new() {
let stats = ProjectSketchStats::new();
assert!(stats.is_ok());
}
#[test]
fn test_project_sketch_stats_record_call() {
let mut stats = ProjectSketchStats::new().unwrap();
for _ in 0..50 {
stats.record_call("process_data");
}
stats.record_call("init");
let freq = stats.estimated_calls("process_data");
assert!(freq >= 50, "Expected call frequency >= 50, got {freq}");
let freq_init = stats.estimated_calls("init");
assert!(
freq_init >= 1,
"Expected call frequency >= 1 for 'init', got {freq_init}"
);
let freq_none = stats.estimated_calls("nonexistent_function");
assert_eq!(freq_none, 0, "Non-recorded function should have freq 0");
}
#[test]
fn test_project_sketch_stats_record_snippet() {
let mut stats = ProjectSketchStats::new().unwrap();
for _ in 0..200 {
stats.record_snippet(b"fn main() {}");
}
for _ in 0..10 {
stats.record_snippet(b"println!(\"hello\")");
}
let top = stats.top_snippets();
assert!(!top.is_empty(), "Top snippets should not be empty");
}
#[test]
fn test_sketch_features_repeated_tokens() {
let tokens: Vec<&str> = (0..500).map(|_| "same_token").collect();
let features = SketchFeatures::from_tokens(&tokens).unwrap();
let est = features.estimated_unique_tokens();
assert!(
est < 5.0,
"Repeated token cardinality should be near 1, got {est}"
);
let freq = features.token_frequency("same_token");
assert!(freq >= 500, "Expected frequency >= 500, got {freq}");
}
}