1use crate::classifier::{
2 file_kind, param_name_hint, FileKind, SegmentClassifier, SegmentType, DEFAULT_CLASSIFIER,
3};
4use crate::hints::SegmentHint;
5use crate::identifier::Identifier;
6use crate::position_stats::{PositionStats, DEFAULT_MAX_VALUES_PER_POSITION};
7use std::collections::{HashMap, HashSet};
8use std::sync::Arc;
9
/// Maximum number of distinct example identifiers a [`Cluster`] retains.
pub const MAX_CLUSTER_EXAMPLES: usize = 10;
/// Minimum fraction of observations that must be classified as dates for a
/// date-dominant query parameter to be reported as a date.
pub const DATE_CONFIDENCE_THRESHOLD: f64 = 0.8;
/// Minimum combined integer + float fraction required before a parameter is
/// reported as a generic number.
pub const NUMBER_CONFIDENCE_THRESHOLD: f64 = 0.8;
/// Fraction that both the integer share and the float share must stay below
/// for a parameter to be reported as a generic number rather than as the
/// dominant numeric subtype.
pub const NUMBER_SUBTYPE_THRESHOLD: f64 = 0.8;
14
/// Minimum number of observations before [`is_enum`] will consider a position.
pub const ENUM_MIN_OBSERVATIONS: usize = 20;
/// Largest number of distinct values a position may have and still be an enum.
pub const ENUM_MAX_CARDINALITY: usize = 10;
/// Every tracked value must have been seen at least this many times.
pub const ENUM_MIN_VALUE_COUNT: usize = 2;
/// Minimum fraction of all observations that the tracked values must account for.
pub const ENUM_MIN_COVERAGE: f64 = 0.95;
19
/// Smallest numeric value [`is_year_position`] accepts as a year (inclusive).
pub const YEAR_RANGE_MIN: f64 = 1900.0;
/// Largest numeric value [`is_year_position`] accepts as a year (inclusive).
pub const YEAR_RANGE_MAX: f64 = 2100.0;
/// Minimum number of observations before a position can be treated as a year.
pub const YEAR_MIN_OBSERVATIONS: usize = 5;
/// Minimum number of distinct values a year position must have (inclusive).
pub const YEAR_MIN_DISTINCT: usize = 2;
/// Maximum number of distinct values a year position may have (inclusive).
pub const YEAR_MAX_DISTINCT: usize = 150;
25
/// Smallest numeric value [`is_http_status_position`] accepts (inclusive).
pub const HTTP_STATUS_RANGE_MIN: f64 = 100.0;
/// Largest numeric value [`is_http_status_position`] accepts (inclusive).
pub const HTTP_STATUS_RANGE_MAX: f64 = 599.0;
/// Minimum number of observations before a position can be treated as an
/// HTTP status.
pub const HTTP_STATUS_MIN_OBSERVATIONS: usize = 5;
/// Minimum number of distinct values an HTTP-status position must have (inclusive).
pub const HTTP_STATUS_MIN_DISTINCT: usize = 2;
/// Maximum number of distinct values an HTTP-status position may have (inclusive).
pub const HTTP_STATUS_MAX_DISTINCT: usize = 30;
31
/// Snapshot of the values observed at one path-segment position.
///
/// `PartialEq`/`Eq` are derived so snapshots can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SegmentPositionStat {
    /// Zero-based index of the path segment this entry describes.
    pub position: usize,
    /// `true` when exactly one distinct value was observed at this position.
    pub stable: bool,
    /// Number of occurrences of each observed segment value.
    pub values: HashMap<String, usize>,
}
38
39#[derive(Debug, Clone)]
40pub struct Cluster {
41 pub key: String,
42 pub host: String,
43 pub scheme: String,
44 pub shape: String,
45 pub examples: Vec<Arc<Identifier>>,
46 pub count: usize,
47 pub segment_counts: Vec<HashMap<String, usize>>,
48 pub param_stats: HashMap<String, PositionStats>,
49 pub max_values: usize,
50 pub example_keys: HashSet<String>,
51}
52
53impl Cluster {
54 pub fn new(
55 key: String,
56 host: String,
57 scheme: String,
58 shape: String,
59 max_values: usize,
60 ) -> Self {
61 let cap = if max_values == 0 {
62 DEFAULT_MAX_VALUES_PER_POSITION
63 } else {
64 max_values
65 };
66 Cluster {
67 key,
68 host,
69 scheme,
70 shape,
71 examples: Vec::new(),
72 count: 0,
73 segment_counts: Vec::new(),
74 param_stats: HashMap::new(),
75 max_values: cap,
76 example_keys: HashSet::new(),
77 }
78 }
79
80 pub fn add(&mut self, iri: &Identifier) {
81 self.add_with(iri, &DEFAULT_CLASSIFIER)
82 }
83
84 pub fn add_with(&mut self, iri: &Identifier, classifier: &SegmentClassifier) {
85 self.count += 1;
86 if self.examples.len() < MAX_CLUSTER_EXAMPLES {
87 let canon = iri.canonical();
88 if self.example_keys.insert(canon) {
89 self.examples.push(Arc::new(iri.clone()));
90 }
91 }
92 for (i, seg) in iri.path_segments.iter().enumerate() {
93 while self.segment_counts.len() <= i {
94 self.segment_counts.push(HashMap::new());
95 }
96 *self.segment_counts[i].entry(seg.clone()).or_insert(0) += 1;
97 }
98 for (name, v) in iri.query_params.iter() {
99 let stats = self
100 .param_stats
101 .entry(name.to_string())
102 .or_insert_with(|| PositionStats::new(self.max_values));
103 stats.observe(v, classifier.classify(v));
104 }
105 }
106
107 pub fn register_example_key(&mut self, canon: String) {
108 self.example_keys.insert(canon);
109 }
110
111 pub fn segment_stats(&self) -> Vec<SegmentPositionStat> {
112 self.segment_counts
113 .iter()
114 .enumerate()
115 .map(|(i, counts)| SegmentPositionStat {
116 position: i,
117 stable: counts.len() == 1,
118 values: counts.clone(),
119 })
120 .collect()
121 }
122
123 pub fn param_summary(&self) -> Vec<ParamSummary> {
124 if self.param_stats.is_empty() {
125 return Vec::new();
126 }
127 let mut rows: Vec<ParamSummary> = self
128 .param_stats
129 .iter()
130 .map(|(name, stats)| {
131 let presence = if self.count > 0 {
132 (stats.total as f64) / (self.count as f64)
133 } else {
134 0.0
135 };
136 let ty = self.param_type(name);
137 let mut row = ParamSummary {
138 name: name.clone(),
139 count: stats.total,
140 ty,
141 cardinality: stats.cardinality(),
142 presence,
143 values: Vec::new(),
144 numeric_count: 0,
145 min: 0.0,
146 max: 0.0,
147 avg: 0.0,
148 value_distribution: HashMap::new(),
149 subtype_distribution: HashMap::new(),
150 kind_distribution: HashMap::new(),
151 };
152 if row.ty == SegmentType::Enum {
153 row.values = enum_values(stats);
154 }
155 if row.ty == SegmentType::Boolean || row.ty == SegmentType::Enum {
156 row.value_distribution = value_distribution(stats);
157 }
158 if row.ty == SegmentType::Number {
159 row.subtype_distribution =
160 subtype_distribution(stats, &[SegmentType::Integer, SegmentType::Float]);
161 }
162 if row.ty == SegmentType::File {
163 row.kind_distribution = file_kind_distribution(stats);
164 }
165 if stats.numeric_count > 0 {
166 row.numeric_count = stats.numeric_count;
167 row.min = stats.numeric_min;
168 row.max = stats.numeric_max;
169 row.avg = stats.numeric_avg();
170 }
171 row
172 })
173 .collect();
174 sort_param_summary(&mut rows);
175 rows
176 }
177
178 pub fn param_type(&self, name: &str) -> SegmentType {
179 let stats = match self.param_stats.get(name) {
180 Some(s) if s.total > 0 => s,
181 _ => return SegmentType::Literal,
182 };
183 let t = stats.dominant_type();
184
185 if is_year_position(t, stats) {
186 return SegmentType::Year;
187 }
188 if is_http_status_position(t, stats) {
189 return SegmentType::HttpStatus;
190 }
191
192 if is_enum(stats) && t != SegmentType::Boolean {
193 return SegmentType::Enum;
194 }
195
196 if t == SegmentType::Date {
197 let date_frac = (*stats.type_counts.get(&SegmentType::Date).unwrap_or(&0) as f64)
198 / (stats.total as f64);
199 if date_frac >= DATE_CONFIDENCE_THRESHOLD {
200 return t;
201 }
202 if let Some(alt) = dominant_excluding(stats, SegmentType::Date) {
203 return alt;
204 }
205 return SegmentType::Literal;
206 }
207
208 if t == SegmentType::Integer || t == SegmentType::Float {
209 let int_frac = (*stats.type_counts.get(&SegmentType::Integer).unwrap_or(&0) as f64)
210 / (stats.total as f64);
211 let float_frac = (*stats.type_counts.get(&SegmentType::Float).unwrap_or(&0) as f64)
212 / (stats.total as f64);
213 if int_frac < NUMBER_SUBTYPE_THRESHOLD
214 && float_frac < NUMBER_SUBTYPE_THRESHOLD
215 && (int_frac + float_frac) >= NUMBER_CONFIDENCE_THRESHOLD
216 {
217 return SegmentType::Number;
218 }
219 }
220
221 if let Some(hint) = param_name_hint(name, t) {
222 return hint;
223 }
224 t
225 }
226}
227
228#[derive(Debug, Clone)]
229pub struct ParamSummary {
230 pub name: String,
231 pub count: usize,
232 pub ty: SegmentType,
233 pub cardinality: usize,
234 pub presence: f64,
235 pub values: Vec<String>,
236 pub numeric_count: usize,
237 pub min: f64,
238 pub max: f64,
239 pub avg: f64,
240 pub value_distribution: HashMap<String, f64>,
241 pub subtype_distribution: HashMap<SegmentType, f64>,
242 pub kind_distribution: HashMap<FileKind, f64>,
243}
244
/// Rounds `f` to four decimal places.
fn round_frac(f: f64) -> f64 {
    const SCALE: f64 = 10000.0;
    let scaled = f * SCALE;
    scaled.round() / SCALE
}
248
249pub fn value_distribution(stats: &PositionStats) -> HashMap<String, f64> {
250 if stats.total == 0 {
251 return HashMap::new();
252 }
253 stats
254 .value_counts
255 .iter()
256 .map(|(v, n)| (v.clone(), round_frac((*n as f64) / (stats.total as f64))))
257 .collect()
258}
259
260pub fn subtype_distribution(
261 stats: &PositionStats,
262 subtypes: &[SegmentType],
263) -> HashMap<SegmentType, f64> {
264 if stats.total == 0 {
265 return HashMap::new();
266 }
267 let mut out = HashMap::new();
268 for &t in subtypes {
269 let n = *stats.type_counts.get(&t).unwrap_or(&0);
270 if n > 0 {
271 out.insert(t, round_frac((n as f64) / (stats.total as f64)));
272 }
273 }
274 out
275}
276
277pub fn file_kind_distribution(stats: &PositionStats) -> HashMap<FileKind, f64> {
278 if stats.value_counts.is_empty() {
279 return HashMap::new();
280 }
281 let total: usize = stats.value_counts.values().sum();
282 if total == 0 {
283 return HashMap::new();
284 }
285 let mut counts: HashMap<Option<FileKind>, usize> = HashMap::new();
286 for (v, n) in &stats.value_counts {
287 let k = file_kind(v);
288 *counts.entry(k).or_insert(0) += *n;
289 }
290 let mut out = HashMap::new();
291 for (k, n) in counts {
292 if let Some(kind) = k {
295 out.insert(kind, round_frac((n as f64) / (total as f64)));
296 }
297 }
298 out
299}
300
301pub fn enum_values(stats: &PositionStats) -> Vec<String> {
302 let mut keys: Vec<String> = stats.value_counts.keys().cloned().collect();
303 keys.sort_by(|a, b| {
304 let na = stats.value_counts[a];
305 let nb = stats.value_counts[b];
306 nb.cmp(&na).then(a.cmp(b))
307 });
308 keys
309}
310
311pub fn is_enum(stats: &PositionStats) -> bool {
312 if stats.total < ENUM_MIN_OBSERVATIONS {
313 return false;
314 }
315 let card = stats.cardinality();
316 if card == 0 || card > ENUM_MAX_CARDINALITY {
317 return false;
318 }
319 let mut covered = 0usize;
320 for &n in stats.value_counts.values() {
321 if n < ENUM_MIN_VALUE_COUNT {
322 return false;
323 }
324 covered += n;
325 }
326 (covered as f64) / (stats.total as f64) >= ENUM_MIN_COVERAGE
327}
328
329pub fn is_year_position(t: SegmentType, stats: &PositionStats) -> bool {
330 if t != SegmentType::Integer || stats.numeric_count == 0 {
331 return false;
332 }
333 let card = stats.cardinality();
334 if !(YEAR_MIN_DISTINCT..=YEAR_MAX_DISTINCT).contains(&card) {
335 return false;
336 }
337 if stats.total < YEAR_MIN_OBSERVATIONS {
338 return false;
339 }
340 stats.numeric_min >= YEAR_RANGE_MIN
341 && stats.numeric_min <= YEAR_RANGE_MAX
342 && stats.numeric_max >= YEAR_RANGE_MIN
343 && stats.numeric_max <= YEAR_RANGE_MAX
344}
345
346pub fn is_http_status_position(t: SegmentType, stats: &PositionStats) -> bool {
347 if t != SegmentType::Integer || stats.numeric_count == 0 {
348 return false;
349 }
350 let card = stats.cardinality();
351 if !(HTTP_STATUS_MIN_DISTINCT..=HTTP_STATUS_MAX_DISTINCT).contains(&card) {
352 return false;
353 }
354 if stats.total < HTTP_STATUS_MIN_OBSERVATIONS {
355 return false;
356 }
357 stats.numeric_min >= HTTP_STATUS_RANGE_MIN
358 && stats.numeric_min <= HTTP_STATUS_RANGE_MAX
359 && stats.numeric_max >= HTTP_STATUS_RANGE_MIN
360 && stats.numeric_max <= HTTP_STATUS_RANGE_MAX
361}
362
363pub fn dominant_excluding(stats: &PositionStats, skip: SegmentType) -> Option<SegmentType> {
364 let mut best: Option<(SegmentType, usize)> = None;
365 for (&t, &n) in &stats.type_counts {
366 if t == skip {
367 continue;
368 }
369 best = match best {
370 None => Some((t, n)),
371 Some((bt, bn)) => {
372 if n > bn || (n == bn && t.as_str() < bt.as_str()) {
373 Some((t, n))
374 } else {
375 Some((bt, bn))
376 }
377 }
378 };
379 }
380 best.map(|(t, _)| t)
381}
382
383fn sort_param_summary(rows: &mut [ParamSummary]) {
384 rows.sort_by(|a, b| b.count.cmp(&a.count).then(a.name.cmp(&b.name)));
385}
386
387pub fn placeholder_for(e: &SegmentHint) -> String {
389 if !e.variable {
390 return e.value.clone();
391 }
392 if !e.hint.is_empty() {
393 return format!("{{{}}}", e.hint);
394 }
395 format!("{{{}}}", e.ty.as_str())
396}