Skip to main content

iriq/
cluster.rs

1use crate::classifier::{
2    file_kind, param_name_hint, FileKind, SegmentClassifier, SegmentType, DEFAULT_CLASSIFIER,
3};
4use crate::hints::SegmentHint;
5use crate::identifier::Identifier;
6use crate::position_stats::{PositionStats, DEFAULT_MAX_VALUES_PER_POSITION};
7use std::collections::{HashMap, HashSet};
8use std::sync::Arc;
9
/// Upper bound on the number of example identifiers a `Cluster` retains.
10pub const MAX_CLUSTER_EXAMPLES: usize = 10;
/// Minimum share of `Date`-classified observations required for a
/// `Date`-dominant parameter to be reported as `Date` by `Cluster::param_type`.
11pub const DATE_CONFIDENCE_THRESHOLD: f64 = 0.8;
/// Minimum combined `Integer` + `Float` share required before a parameter is
/// reported as `Number`.
12pub const NUMBER_CONFIDENCE_THRESHOLD: f64 = 0.8;
/// Share that both `Integer` and `Float` must individually stay below for a
/// parameter to be reported as `Number`.
13pub const NUMBER_SUBTYPE_THRESHOLD: f64 = 0.8;
14
// Thresholds consulted by `is_enum`.
/// Minimum number of observations before a position can be an enum.
15pub const ENUM_MIN_OBSERVATIONS: usize = 20;
/// Maximum cardinality an enum position may have.
16pub const ENUM_MAX_CARDINALITY: usize = 10;
/// Minimum occurrence count every tracked value must reach.
17pub const ENUM_MIN_VALUE_COUNT: usize = 2;
/// Minimum fraction of all observations the tracked values must account for.
18pub const ENUM_MIN_COVERAGE: f64 = 0.95;
19
// Thresholds consulted by `is_year_position`.
/// Inclusive lower bound of the numeric range accepted as a year.
20pub const YEAR_RANGE_MIN: f64 = 1900.0;
/// Inclusive upper bound of the numeric range accepted as a year.
21pub const YEAR_RANGE_MAX: f64 = 2100.0;
/// Minimum number of observations before a position can be a year.
22pub const YEAR_MIN_OBSERVATIONS: usize = 5;
/// Inclusive lower bound on the cardinality of a year position.
23pub const YEAR_MIN_DISTINCT: usize = 2;
/// Inclusive upper bound on the cardinality of a year position.
24pub const YEAR_MAX_DISTINCT: usize = 150;
25
// Thresholds consulted by `is_http_status_position`.
/// Inclusive lower bound of the numeric range accepted as an HTTP status.
26pub const HTTP_STATUS_RANGE_MIN: f64 = 100.0;
/// Inclusive upper bound of the numeric range accepted as an HTTP status.
27pub const HTTP_STATUS_RANGE_MAX: f64 = 599.0;
/// Minimum number of observations before a position can be an HTTP status.
28pub const HTTP_STATUS_MIN_OBSERVATIONS: usize = 5;
/// Inclusive lower bound on the cardinality of an HTTP status position.
29pub const HTTP_STATUS_MIN_DISTINCT: usize = 2;
/// Inclusive upper bound on the cardinality of an HTTP status position.
30pub const HTTP_STATUS_MAX_DISTINCT: usize = 30;
31
/// Snapshot of the values observed at one path-segment position of a cluster,
/// as produced by `Cluster::segment_stats`.
32#[derive(Debug, Clone)]
33pub struct SegmentPositionStat {
    /// Zero-based index of the path segment this entry describes.
34    pub position: usize,
    /// `true` when exactly one distinct value was observed at this position.
35    pub stable: bool,
    /// Observed segment values mapped to their occurrence counts.
36    pub values: HashMap<String, usize>,
37}
38
39#[derive(Debug, Clone)]
40pub struct Cluster {
41    pub key: String,
42    pub host: String,
43    pub scheme: String,
44    pub shape: String,
45    pub examples: Vec<Arc<Identifier>>,
46    pub count: usize,
47    pub segment_counts: Vec<HashMap<String, usize>>,
48    pub param_stats: HashMap<String, PositionStats>,
49    pub max_values: usize,
50    pub example_keys: HashSet<String>,
51}
52
53impl Cluster {
54    pub fn new(
55        key: String,
56        host: String,
57        scheme: String,
58        shape: String,
59        max_values: usize,
60    ) -> Self {
61        let cap = if max_values == 0 {
62            DEFAULT_MAX_VALUES_PER_POSITION
63        } else {
64            max_values
65        };
66        Cluster {
67            key,
68            host,
69            scheme,
70            shape,
71            examples: Vec::new(),
72            count: 0,
73            segment_counts: Vec::new(),
74            param_stats: HashMap::new(),
75            max_values: cap,
76            example_keys: HashSet::new(),
77        }
78    }
79
80    pub fn add(&mut self, iri: &Identifier) {
81        self.add_with(iri, &DEFAULT_CLASSIFIER)
82    }
83
84    pub fn add_with(&mut self, iri: &Identifier, classifier: &SegmentClassifier) {
85        self.count += 1;
86        if self.examples.len() < MAX_CLUSTER_EXAMPLES {
87            let canon = iri.canonical();
88            if self.example_keys.insert(canon) {
89                self.examples.push(Arc::new(iri.clone()));
90            }
91        }
92        for (i, seg) in iri.path_segments.iter().enumerate() {
93            while self.segment_counts.len() <= i {
94                self.segment_counts.push(HashMap::new());
95            }
96            *self.segment_counts[i].entry(seg.clone()).or_insert(0) += 1;
97        }
98        for (name, v) in iri.query_params.iter() {
99            let stats = self
100                .param_stats
101                .entry(name.to_string())
102                .or_insert_with(|| PositionStats::new(self.max_values));
103            stats.observe(v, classifier.classify(v));
104        }
105    }
106
107    pub fn register_example_key(&mut self, canon: String) {
108        self.example_keys.insert(canon);
109    }
110
111    pub fn segment_stats(&self) -> Vec<SegmentPositionStat> {
112        self.segment_counts
113            .iter()
114            .enumerate()
115            .map(|(i, counts)| SegmentPositionStat {
116                position: i,
117                stable: counts.len() == 1,
118                values: counts.clone(),
119            })
120            .collect()
121    }
122
123    pub fn param_summary(&self) -> Vec<ParamSummary> {
124        if self.param_stats.is_empty() {
125            return Vec::new();
126        }
127        let mut rows: Vec<ParamSummary> = self
128            .param_stats
129            .iter()
130            .map(|(name, stats)| {
131                let presence = if self.count > 0 {
132                    (stats.total as f64) / (self.count as f64)
133                } else {
134                    0.0
135                };
136                let ty = self.param_type(name);
137                let mut row = ParamSummary {
138                    name: name.clone(),
139                    count: stats.total,
140                    ty,
141                    cardinality: stats.cardinality(),
142                    presence,
143                    values: Vec::new(),
144                    numeric_count: 0,
145                    min: 0.0,
146                    max: 0.0,
147                    avg: 0.0,
148                    value_distribution: HashMap::new(),
149                    subtype_distribution: HashMap::new(),
150                    kind_distribution: HashMap::new(),
151                };
152                if row.ty == SegmentType::Enum {
153                    row.values = enum_values(stats);
154                }
155                if row.ty == SegmentType::Boolean || row.ty == SegmentType::Enum {
156                    row.value_distribution = value_distribution(stats);
157                }
158                if row.ty == SegmentType::Number {
159                    row.subtype_distribution =
160                        subtype_distribution(stats, &[SegmentType::Integer, SegmentType::Float]);
161                }
162                if row.ty == SegmentType::File {
163                    row.kind_distribution = file_kind_distribution(stats);
164                }
165                if stats.numeric_count > 0 {
166                    row.numeric_count = stats.numeric_count;
167                    row.min = stats.numeric_min;
168                    row.max = stats.numeric_max;
169                    row.avg = stats.numeric_avg();
170                }
171                row
172            })
173            .collect();
174        sort_param_summary(&mut rows);
175        rows
176    }
177
178    pub fn param_type(&self, name: &str) -> SegmentType {
179        let stats = match self.param_stats.get(name) {
180            Some(s) if s.total > 0 => s,
181            _ => return SegmentType::Literal,
182        };
183        let t = stats.dominant_type();
184
185        if is_year_position(t, stats) {
186            return SegmentType::Year;
187        }
188        if is_http_status_position(t, stats) {
189            return SegmentType::HttpStatus;
190        }
191
192        if is_enum(stats) && t != SegmentType::Boolean {
193            return SegmentType::Enum;
194        }
195
196        if t == SegmentType::Date {
197            let date_frac = (*stats.type_counts.get(&SegmentType::Date).unwrap_or(&0) as f64)
198                / (stats.total as f64);
199            if date_frac >= DATE_CONFIDENCE_THRESHOLD {
200                return t;
201            }
202            if let Some(alt) = dominant_excluding(stats, SegmentType::Date) {
203                return alt;
204            }
205            return SegmentType::Literal;
206        }
207
208        if t == SegmentType::Integer || t == SegmentType::Float {
209            let int_frac = (*stats.type_counts.get(&SegmentType::Integer).unwrap_or(&0) as f64)
210                / (stats.total as f64);
211            let float_frac = (*stats.type_counts.get(&SegmentType::Float).unwrap_or(&0) as f64)
212                / (stats.total as f64);
213            if int_frac < NUMBER_SUBTYPE_THRESHOLD
214                && float_frac < NUMBER_SUBTYPE_THRESHOLD
215                && (int_frac + float_frac) >= NUMBER_CONFIDENCE_THRESHOLD
216            {
217                return SegmentType::Number;
218            }
219        }
220
221        if let Some(hint) = param_name_hint(name, t) {
222            return hint;
223        }
224        t
225    }
226}
227
/// One row of `Cluster::param_summary`: aggregate facts about a single query
/// parameter within a cluster.
228#[derive(Debug, Clone)]
229pub struct ParamSummary {
    /// Query parameter name.
230    pub name: String,
    /// Total number of observations recorded for this parameter.
231    pub count: usize,
    /// Type inferred by `Cluster::param_type`.
232    pub ty: SegmentType,
    /// Cardinality reported by the parameter's `PositionStats`.
233    pub cardinality: usize,
    /// `count` divided by the cluster's identifier count (0.0 for an empty cluster).
234    pub presence: f64,
    /// Enum values, most frequent first; filled only when `ty` is `Enum`.
235    pub values: Vec<String>,
    /// Number of numeric observations; the four numeric fields stay at zero
    /// when no numeric value was recorded.
236    pub numeric_count: usize,
    /// Smallest numeric value observed.
237    pub min: f64,
    /// Largest numeric value observed.
238    pub max: f64,
    /// Average of the numeric values observed.
239    pub avg: f64,
    /// Share of observations per value; filled only for `Boolean` and `Enum`.
240    pub value_distribution: HashMap<String, f64>,
    /// Share of `Integer` / `Float` observations; filled only for `Number`.
241    pub subtype_distribution: HashMap<SegmentType, f64>,
    /// Share of observations per file kind; filled only for `File`.
242    pub kind_distribution: HashMap<FileKind, f64>,
243}
244
/// Rounds `f` to four decimal places (half away from zero, per `f64::round`).
fn round_frac(f: f64) -> f64 {
    const SCALE: f64 = 10000.0;
    let scaled = f * SCALE;
    scaled.round() / SCALE
}
248
249pub fn value_distribution(stats: &PositionStats) -> HashMap<String, f64> {
250    if stats.total == 0 {
251        return HashMap::new();
252    }
253    stats
254        .value_counts
255        .iter()
256        .map(|(v, n)| (v.clone(), round_frac((*n as f64) / (stats.total as f64))))
257        .collect()
258}
259
260pub fn subtype_distribution(
261    stats: &PositionStats,
262    subtypes: &[SegmentType],
263) -> HashMap<SegmentType, f64> {
264    if stats.total == 0 {
265        return HashMap::new();
266    }
267    let mut out = HashMap::new();
268    for &t in subtypes {
269        let n = *stats.type_counts.get(&t).unwrap_or(&0);
270        if n > 0 {
271            out.insert(t, round_frac((n as f64) / (stats.total as f64)));
272        }
273    }
274    out
275}
276
277pub fn file_kind_distribution(stats: &PositionStats) -> HashMap<FileKind, f64> {
278    if stats.value_counts.is_empty() {
279        return HashMap::new();
280    }
281    let total: usize = stats.value_counts.values().sum();
282    if total == 0 {
283        return HashMap::new();
284    }
285    let mut counts: HashMap<Option<FileKind>, usize> = HashMap::new();
286    for (v, n) in &stats.value_counts {
287        let k = file_kind(v);
288        *counts.entry(k).or_insert(0) += *n;
289    }
290    let mut out = HashMap::new();
291    for (k, n) in counts {
292        // Unknown values are bucketed separately in Go using FileKind("unknown");
293        // here we filter to known kinds. (Phase 1 omitted this nuance; phase 2 keeps it.)
294        if let Some(kind) = k {
295            out.insert(kind, round_frac((n as f64) / (total as f64)));
296        }
297    }
298    out
299}
300
301pub fn enum_values(stats: &PositionStats) -> Vec<String> {
302    let mut keys: Vec<String> = stats.value_counts.keys().cloned().collect();
303    keys.sort_by(|a, b| {
304        let na = stats.value_counts[a];
305        let nb = stats.value_counts[b];
306        nb.cmp(&na).then(a.cmp(b))
307    });
308    keys
309}
310
311pub fn is_enum(stats: &PositionStats) -> bool {
312    if stats.total < ENUM_MIN_OBSERVATIONS {
313        return false;
314    }
315    let card = stats.cardinality();
316    if card == 0 || card > ENUM_MAX_CARDINALITY {
317        return false;
318    }
319    let mut covered = 0usize;
320    for &n in stats.value_counts.values() {
321        if n < ENUM_MIN_VALUE_COUNT {
322            return false;
323        }
324        covered += n;
325    }
326    (covered as f64) / (stats.total as f64) >= ENUM_MIN_COVERAGE
327}
328
329pub fn is_year_position(t: SegmentType, stats: &PositionStats) -> bool {
330    if t != SegmentType::Integer || stats.numeric_count == 0 {
331        return false;
332    }
333    let card = stats.cardinality();
334    if !(YEAR_MIN_DISTINCT..=YEAR_MAX_DISTINCT).contains(&card) {
335        return false;
336    }
337    if stats.total < YEAR_MIN_OBSERVATIONS {
338        return false;
339    }
340    stats.numeric_min >= YEAR_RANGE_MIN
341        && stats.numeric_min <= YEAR_RANGE_MAX
342        && stats.numeric_max >= YEAR_RANGE_MIN
343        && stats.numeric_max <= YEAR_RANGE_MAX
344}
345
346pub fn is_http_status_position(t: SegmentType, stats: &PositionStats) -> bool {
347    if t != SegmentType::Integer || stats.numeric_count == 0 {
348        return false;
349    }
350    let card = stats.cardinality();
351    if !(HTTP_STATUS_MIN_DISTINCT..=HTTP_STATUS_MAX_DISTINCT).contains(&card) {
352        return false;
353    }
354    if stats.total < HTTP_STATUS_MIN_OBSERVATIONS {
355        return false;
356    }
357    stats.numeric_min >= HTTP_STATUS_RANGE_MIN
358        && stats.numeric_min <= HTTP_STATUS_RANGE_MAX
359        && stats.numeric_max >= HTTP_STATUS_RANGE_MIN
360        && stats.numeric_max <= HTTP_STATUS_RANGE_MAX
361}
362
363pub fn dominant_excluding(stats: &PositionStats, skip: SegmentType) -> Option<SegmentType> {
364    let mut best: Option<(SegmentType, usize)> = None;
365    for (&t, &n) in &stats.type_counts {
366        if t == skip {
367            continue;
368        }
369        best = match best {
370            None => Some((t, n)),
371            Some((bt, bn)) => {
372                if n > bn || (n == bn && t.as_str() < bt.as_str()) {
373                    Some((t, n))
374                } else {
375                    Some((bt, bn))
376                }
377            }
378        };
379    }
380    best.map(|(t, _)| t)
381}
382
383fn sort_param_summary(rows: &mut [ParamSummary]) {
384    rows.sort_by(|a, b| b.count.cmp(&a.count).then(a.name.cmp(&b.name)));
385}
386
387// Conveniences used by Cluster / Corpus when deriving keys.
388pub fn placeholder_for(e: &SegmentHint) -> String {
389    if !e.variable {
390        return e.value.clone();
391    }
392    if !e.hint.is_empty() {
393        return format!("{{{}}}", e.hint);
394    }
395    format!("{{{}}}", e.ty.as_str())
396}