use std::io::BufRead;
use ahash::{AHashMap, AHashSet};
use crate::config::Config;
use crate::url::{parse_path, parse_url};
/// Cardinality statistics gathered from a set of URLs or paths.
pub struct CardinalityReport {
/// Per-position statistics, keyed by the zero-based index of the path
/// segment (empty segments are not counted when indexing).
pub position_stats: AHashMap<usize, PositionInfo>,
/// Per-key statistics for query-string parameters, keyed by the decoded
/// parameter name.
pub query_param_stats: AHashMap<String, QueryParamInfo>,
}
/// Cardinality summary for one path-segment position.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PositionInfo {
    /// Number of distinct segment values observed at this position.
    pub values: usize,
    /// A handful of the distinct values (at most ten), kept as examples.
    pub sample: Vec<String>,
}
/// Cardinality summary for one query-string key.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct QueryParamInfo {
    /// Number of distinct values observed for this key.
    pub unique_values: usize,
    /// A handful of the distinct values (at most ten), kept as examples.
    // Neither the report printer nor the config builder in this file reads
    // the sample, hence the lint exemption.
    #[allow(dead_code)]
    pub sample: Vec<String>,
}
/// Result of `analyze_cardinality`; currently just wraps the report.
pub struct AnalyzedInput {
/// Statistics collected while reading the input.
pub report: CardinalityReport,
}
#[must_use]
pub fn analyze_cardinality<R: BufRead>(
reader: R,
_config: &Config,
assume_scheme: &str,
path_only: bool,
) -> AnalyzedInput {
let mut position_values: AHashMap<usize, Vec<String>> = AHashMap::new();
let mut query_param_values: AHashMap<String, Vec<String>> = AHashMap::new();
for line in reader.lines() {
let Ok(line) = line else { continue };
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let parsed = if path_only {
parse_path(trimmed)
} else {
parse_url(trimmed, assume_scheme)
};
let Some(parsed) = parsed else { continue };
let segments: Vec<&str> = parsed.path.split('/').filter(|s| !s.is_empty()).collect();
for (i, segment) in segments.iter().enumerate() {
position_values
.entry(i)
.or_default()
.push(segment.to_string());
}
if let Some(query) = &parsed.query {
let pairs: Vec<_> = url::form_urlencoded::parse(query.as_bytes()).collect();
for (key, value) in pairs {
query_param_values
.entry(key.to_string())
.or_default()
.push(value.to_string());
}
}
}
let mut position_stats = AHashMap::new();
for (pos, values) in position_values {
let unique: AHashSet<_> = values.iter().cloned().collect();
let sample: Vec<_> = unique.iter().take(10).cloned().collect();
position_stats.insert(
pos,
PositionInfo {
values: unique.len(),
sample,
},
);
}
let mut query_param_stats = AHashMap::new();
for (key, values) in query_param_values {
let unique: AHashSet<_> = values.iter().cloned().collect();
let sample: Vec<_> = unique.iter().take(10).cloned().collect();
query_param_stats.insert(
key,
QueryParamInfo {
unique_values: unique.len(),
sample,
},
);
}
AnalyzedInput {
report: CardinalityReport {
position_stats,
query_param_stats,
},
}
}
pub fn print_cardinality_report(report: &CardinalityReport) {
eprintln!("\n=== Cardinality Analysis ===\n");
eprintln!("--- Path Segment Analysis ---");
eprintln!("{:<10} {:<30} {:<10} Decision", "Position", "Pattern", "Unique");
eprintln!("{}", "-".repeat(70));
let mut positions: Vec<_> = report.position_stats.iter().collect();
positions.sort_by_key(|(pos, _)| **pos);
for (pos, info) in positions {
let sample_str = info.sample.join(", ");
let decision = if info.values > 500 {
"DYNAMIC (entropy)"
} else if info.values > 50 {
"DYNAMIC (pattern)"
} else if info.values > 5 {
"MIXED"
} else {
"STATIC"
};
eprintln!("{:<10} {:<30} {:<10} {}", pos, sample_str, info.values, decision);
}
eprintln!("\n--- Query Parameter Analysis ---");
eprintln!("{:<20} {:<10} Decision", "Key", "Unique");
eprintln!("{}", "-".repeat(50));
let mut params: Vec<_> = report.query_param_stats.iter().collect();
params.sort_by_key(|(key, _)| key.as_str());
for (key, info) in params {
let decision = if info.unique_values > 50 {
"DYNAMIC (normalize)"
} else if info.unique_values > 5 {
"MIXED"
} else {
"STATIC (keep)"
};
eprintln!("{:<20} {:<10} {}", key, info.unique_values, decision);
}
eprintln!();
}
/// Returns `true` when `s` has the hyphenated 8-4-4-4-12 hexadecimal layout,
/// i.e. exactly the shape the UUID pattern emitted below matches.
fn is_canonical_uuid(s: &str) -> bool {
    s.len() == 36
        && s.bytes().enumerate().all(|(i, b)| match i {
            8 | 13 | 18 | 23 => b == b'-',
            _ => b.is_ascii_hexdigit(),
        })
}

/// Returns `true` when `s` is one or more ASCII digits (what `\d+` needs).
fn is_ascii_number(s: &str) -> bool {
    !s.is_empty() && s.bytes().all(|b| b.is_ascii_digit())
}

/// Derives a `Config` from the statistics in `report`.
///
/// * Query keys with more than 50 distinct values go to `always_normalize`;
///   keys with at most 5 go to `never_normalize`; the rest are left out.
/// * A path position with 51 to 500 distinct values contributes a segment
///   pattern when every sampled value is a canonical UUID or every sampled
///   value is a decimal number. Positions above 500 (labelled
///   "DYNAMIC (entropy)" in the printed report) get no explicit pattern here.
///
/// All lists are sorted, and the patterns de-duplicated, so the result does
/// not depend on hash-map iteration order. `general` and `literal_segments`
/// keep their defaults.
#[must_use]
pub fn build_learned_config(report: &CardinalityReport) -> Config {
    let mut always_normalize = Vec::new();
    let mut never_normalize = Vec::new();
    for (key, info) in &report.query_param_stats {
        match info.unique_values {
            0..=5 => never_normalize.push(key.clone()),
            6..=50 => {}
            _ => always_normalize.push(key.clone()),
        }
    }
    always_normalize.sort();
    never_normalize.sort();

    let mut pattern_segments = Vec::new();
    for info in report.position_stats.values() {
        if !(51..=500).contains(&info.values) {
            continue;
        }
        // `all` is vacuously true on an empty sample; never learn a pattern
        // from no evidence.
        if info.sample.is_empty() {
            continue;
        }
        if info.sample.iter().all(|s| is_canonical_uuid(s)) {
            pattern_segments.push(
                r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
                    .into(),
            );
        } else if info.sample.iter().all(|s| is_ascii_number(s)) {
            pattern_segments.push(r"\d+".into());
        }
    }
    pattern_segments.sort();
    pattern_segments.dedup();

    Config {
        general: crate::config::GeneralConfig::default(),
        normalize_params: crate::config::NormalizeParamsConfig {
            always_normalize,
            never_normalize,
        },
        structural: crate::config::StructuralConfig {
            literal_segments: crate::config::StructuralConfig::default().literal_segments,
            pattern_segments,
        },
    }
}
/// Builds the learned configuration for `report`, serializes it as TOML and
/// writes it to `path`, then reports the destination on stderr.
///
/// # Errors
///
/// Returns a descriptive message if TOML serialization fails or if the file
/// cannot be written.
pub fn save_learned_config(report: &CardinalityReport, path: &str) -> Result<(), String> {
    let learned = build_learned_config(report);

    let serialized = match toml::to_string(&learned) {
        Ok(text) => text,
        Err(e) => return Err(format!("Failed to serialize config: {e}")),
    };

    if let Err(e) = std::fs::write(path, serialized) {
        return Err(format!("Failed to write config file: {e}"));
    }

    eprintln!("Learned config saved to {path}");
    Ok(())
}