1use crate::record::Column;
17use crate::value::Value;
18use serde::{Deserialize, Serialize};
19use std::collections::BTreeSet;
20
/// Minimum number of numeric samples a column must have before a strictly
/// monotonic run is classified as [`Role::Sequence`] by `Column::role`.
/// Shorter numeric columns fall through to [`Role::Measurement`] even when
/// they happen to be strictly monotonic.
const SEQUENCE_MIN_LEN: usize = 20;
25
/// Lower-case name tokens that mark a column as an identifier.
///
/// Names are split into tokens by `name_tokens` and compared token-by-token
/// (see `name_is_identifier`), so a name only matches when one of its whole
/// tokens equals an entry here; a name such as `period` or `valid` does not
/// match merely because it contains `pid` or `id` as a substring.
const ID_TOKENS: &[&str] = &[
    "id",
    "ids",
    "uid",
    "uuid",
    "guid",
    "gid",
    "pid",
    "procid",
    "ppid",
    "tid",
    "sid",
    "session",
    "sessionid",
];
43
/// The role a column is assigned by `Column::role`.
///
/// Serialized with serde using the lower-cased variant name
/// (`rename_all = "lowercase"`), which is the same spelling `Role::token`
/// returns.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Role {
    /// Numeric column that is neither identifier- nor timestamp-named and is
    /// not a long strictly monotonic run.
    Measurement,
    /// Column whose name contains an identifier token (see `ID_TOKENS`).
    Identifier,
    /// Default for non-numeric columns that match no other rule.
    Categorical,
    /// Column with a timestamp-like name, or a numeric column that is long
    /// enough and strictly monotonic.
    Sequence,
    /// Column with at most one distinct non-null value.
    Constant,
}
60
61impl Role {
62 pub fn token(self) -> &'static str {
63 match self {
64 Role::Measurement => "measurement",
65 Role::Identifier => "identifier",
66 Role::Categorical => "categorical",
67 Role::Sequence => "sequence",
68 Role::Constant => "constant",
69 }
70 }
71
72 pub fn is_measured(self) -> bool {
74 matches!(self, Role::Measurement)
75 }
76
77 pub fn skips_value_detection(self) -> bool {
84 matches!(self, Role::Identifier | Role::Sequence)
85 }
86}
87
/// Pairs a column name with the [`Role`] assigned to it.
#[derive(Debug, Clone, Serialize)]
pub struct ColumnRole {
    /// Name of the column.
    pub column: String,
    /// Role assigned to that column.
    pub role: Role,
}
94
/// Splits a column name into lower-cased ASCII alphanumeric tokens.
///
/// A new token starts:
/// - after any character that is not ASCII alphanumeric (such characters are
///   separators and never appear in the output), and
/// - at a camelCase boundary, i.e. an ASCII uppercase letter that directly
///   follows an ASCII lowercase letter or digit.
///
/// Runs of uppercase letters are kept together (`"SYSLOG_PID"` yields
/// `["syslog", "pid"]`), and empty tokens are never produced.
fn name_tokens(name: &str) -> Vec<String> {
    /// Moves `pending` into `tokens` if it holds anything, leaving it empty.
    fn flush(tokens: &mut Vec<String>, pending: &mut String) {
        if !pending.is_empty() {
            tokens.push(std::mem::take(pending));
        }
    }

    let mut tokens = Vec::new();
    let mut pending = String::new();
    // Previous character of the current alphanumeric run, in its original
    // case; `None` right after a separator or at the start of the name.
    let mut previous: Option<char> = None;

    for ch in name.chars() {
        if !ch.is_ascii_alphanumeric() {
            flush(&mut tokens, &mut pending);
            previous = None;
            continue;
        }

        let after_lower_or_digit =
            matches!(previous, Some(p) if p.is_ascii_lowercase() || p.is_ascii_digit());
        if ch.is_ascii_uppercase() && after_lower_or_digit {
            flush(&mut tokens, &mut pending);
        }

        pending.push(ch.to_ascii_lowercase());
        previous = Some(ch);
    }

    flush(&mut tokens, &mut pending);
    tokens
}
120
121pub fn name_is_identifier(name: &str) -> bool {
123 name_tokens(name)
124 .iter()
125 .any(|t| ID_TOKENS.contains(&t.as_str()))
126}
127
/// Lower-case name tokens that mark a column as a timestamp.
///
/// Compared against whole tokens from `name_tokens` (see
/// `name_is_timestamp`), so a name such as `response_time` does not match.
const TIME_TOKENS: &[&str] = &["timestamp", "ts"];
135
136pub fn name_is_timestamp(name: &str) -> bool {
138 name_tokens(name)
139 .iter()
140 .any(|t| TIME_TOKENS.contains(&t.as_str()))
141}
142
143fn distinct_key(v: &Value) -> String {
145 match v {
146 Value::Null => String::new(),
147 Value::Bool(b) => format!("b{b}"),
148 Value::Int(i) => format!("i{i}"),
149 Value::Float(f) => format!("f{}", f.to_bits()),
150 Value::Str(s) => format!("s{s}"),
151 }
152}
153
/// Returns `true` when `xs` has at least two elements and every element is
/// strictly greater than its predecessor, or every element is strictly
/// smaller than its predecessor.
///
/// Equal neighbours make the slice non-monotonic, and so does any `NaN`,
/// because every ordered comparison with `NaN` is false.
fn is_strictly_monotonic(xs: &[f64]) -> bool {
    match xs {
        // Fewer than two samples carry no direction at all.
        [] | [_] => false,
        [_, rest @ ..] => {
            // Zipping the slice with its own tail yields each adjacent pair.
            let rising = xs.iter().zip(rest).all(|(prev, next)| next > prev);
            let falling = xs.iter().zip(rest).all(|(prev, next)| next < prev);
            rising || falling
        }
    }
}
162
163impl Column {
164 fn distinct_count(&self) -> usize {
166 let mut seen = BTreeSet::new();
167 for c in &self.cells {
168 if !matches!(c, Value::Null) {
169 seen.insert(distinct_key(c));
170 }
171 }
172 seen.len()
173 }
174
175 pub fn role(&self) -> Role {
185 if self.distinct_count() <= 1 {
186 return Role::Constant;
187 }
188 if name_is_identifier(&self.name) {
189 return Role::Identifier;
190 }
191 if name_is_timestamp(&self.name) {
193 return Role::Sequence;
194 }
195 if self.ty.is_numeric() {
196 let xs = self.numeric();
197 if xs.len() >= SEQUENCE_MIN_LEN && is_strictly_monotonic(&xs) {
198 Role::Sequence
199 } else {
200 Role::Measurement
201 }
202 } else {
203 Role::Categorical
204 }
205 }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a column called `name` from the given cells.
    fn col(name: &str, cells: Vec<Value>) -> Column {
        Column::new(name, cells)
    }

    /// Builds a column called `name` whose cells are all `Value::Int`.
    fn ints(name: &str, xs: &[i64]) -> Column {
        col(name, xs.iter().map(|&i| Value::Int(i)).collect())
    }

    #[test]
    fn role_tokens_and_is_measured() {
        assert_eq!(Role::Measurement.token(), "measurement");
        assert_eq!(Role::Identifier.token(), "identifier");
        assert_eq!(Role::Categorical.token(), "categorical");
        assert_eq!(Role::Sequence.token(), "sequence");
        assert_eq!(Role::Constant.token(), "constant");
        assert!(Role::Measurement.is_measured());
        for r in [
            Role::Identifier,
            Role::Categorical,
            Role::Sequence,
            Role::Constant,
        ] {
            assert!(!r.is_measured(), "{:?} must not be measured", r);
        }
    }

    #[test]
    fn skips_value_detection_targets_identifier_and_sequence() {
        assert!(Role::Identifier.skips_value_detection());
        assert!(Role::Sequence.skips_value_detection());
        for r in [Role::Measurement, Role::Categorical, Role::Constant] {
            assert!(!r.skips_value_detection(), "{:?} must not be skipped", r);
        }
    }

    #[test]
    fn strictly_monotonic_predicate() {
        assert!(is_strictly_monotonic(&[1.0, 2.0, 3.0]));
        assert!(is_strictly_monotonic(&[3.0, 2.0, 1.0]));
        assert!(is_strictly_monotonic(&[1.0, 2.0]));
        assert!(!is_strictly_monotonic(&[1.0, 1.0, 2.0]));
        assert!(!is_strictly_monotonic(&[3.0, 3.0, 1.0]));
        assert!(!is_strictly_monotonic(&[1.0, 3.0, 2.0]));
        assert!(!is_strictly_monotonic(&[5.0]));
        assert!(!is_strictly_monotonic(&[]));
    }

    #[test]
    fn name_tokenizer_splits_underscores_and_camel_case() {
        assert_eq!(name_tokens("SYSLOG_PID"), vec!["syslog", "pid"]);
        assert_eq!(name_tokens("durationNanos"), vec!["duration", "nanos"]);
        assert_eq!(name_tokens("sessionId"), vec!["session", "id"]);
        assert_eq!(name_tokens("_PID"), vec!["pid"]);
        assert_eq!(name_tokens("JOB_ID"), vec!["job", "id"]);
    }

    #[test]
    fn identifier_names_recognized_without_false_positives() {
        for id in [
            "_PID",
            "_UID",
            "_GID",
            "JOB_ID",
            "TID",
            "SYSLOG_PID",
            "user_id",
            "uuid",
            "procid",
        ] {
            assert!(
                name_is_identifier(id),
                "{id} should look like an identifier"
            );
        }
        // Names that merely contain an identifier token as a substring.
        for m in [
            "DAYS_LOST",
            "durationNanos",
            "fare",
            "age",
            "humidity",
            "valid",
            "period",
        ] {
            assert!(
                !name_is_identifier(m),
                "{m} must NOT look like an identifier"
            );
        }
    }

    #[test]
    fn timestamp_named_columns_are_sequences() {
        for ts in [
            "__REALTIME_TIMESTAMP",
            "__MONOTONIC_TIMESTAMP",
            "timestamp",
            "ts",
        ] {
            assert!(name_is_timestamp(ts), "{ts} should read as a timestamp");
            // Short and not monotonic: only the name can make this a sequence.
            let jittery = ints(ts, &[100, 101, 101, 105, 104, 110, 130]);
            assert_eq!(jittery.role(), Role::Sequence, "{ts}");
        }
        for m in ["response_time", "DAYS_LOST", "duration_ms", "fare"] {
            assert!(!name_is_timestamp(m), "{m} must NOT read as a timestamp");
        }
    }

    #[test]
    fn constant_takes_precedence() {
        assert_eq!(ints("anything", &[5, 5, 5, 5]).role(), Role::Constant);
        // The constant rule wins even over an identifier-like name.
        assert_eq!(ints("user_id", &[7, 7]).role(), Role::Constant);
    }

    #[test]
    fn identifier_by_name_beats_a_numeric_distribution() {
        let pid = ints("_PID", &[100, 200, 100, 300, 200, 100, 400, 300, 100]);
        assert_eq!(pid.role(), Role::Identifier);
    }

    #[test]
    fn long_strictly_monotonic_numeric_is_a_sequence() {
        // Column names here must match neither ID_TOKENS nor TIME_TOKENS, so
        // that only the numeric monotonicity rule can yield Role::Sequence.
        // (A column named "ts" would be a sequence by name alone and would
        // leave the increasing case untested.)
        let up: Vec<i64> = (0..40).collect();
        assert_eq!(ints("offset", &up).role(), Role::Sequence);
        let down: Vec<i64> = (0..40).rev().collect();
        assert_eq!(ints("countdown", &down).role(), Role::Sequence);
        // Strictly monotonic but shorter than SEQUENCE_MIN_LEN.
        assert_eq!(
            ints("small", &[10, 11, 14, 20, 31]).role(),
            Role::Measurement
        );
    }

    #[test]
    fn near_constant_with_outliers_stays_measurement_not_categorical() {
        let mut xs = vec![10i64; 30];
        xs.push(1000);
        assert_eq!(ints("x", &xs).role(), Role::Measurement);
    }

    #[test]
    fn non_numeric_default_is_categorical() {
        let msg = col(
            "message",
            (0..50).map(|i| Value::Str(format!("event {i}"))).collect(),
        );
        assert_eq!(msg.role(), Role::Categorical);
        let same = col("kind", vec![Value::Str("a".into()); 5]);
        assert_eq!(same.role(), Role::Constant);
    }
}