Skip to main content

anomalyx_core/
roles.rs

//! Column role classification — a deterministic, *transparent* read of what each
//! column is: a continuous `Measurement`, an `Identifier` (arbitrary label), a
//! low-cardinality `Categorical` code, a monotonic `Sequence`, or a `Constant`.
//!
//! Detectors consult the role to skip columns where their statistic is
//! meaningless — a "point outlier" in a process-id or a severity-code column is
//! noise, not signal. This is a heuristic, but never a *silent* one: every
//! column's role ships in the envelope (so an agent can see and audit it), and
//! the CLI's `--no-column-roles` disables role-based skipping entirely.
//!
//! Identifiers are recognized by **name** — the only reliable signal, since a
//! process-id column is statistically indistinguishable from a discrete
//! measurement. A continuous measurement (`fare`, `durationNanos`, `DAYS_LOST`)
//! is never named like an id, so it is never misclassified by this rule.
//! Clock columns (`timestamp`, `ts`) are likewise recognized by name and
//! reported as `Sequence`, even when they are not strictly monotonic.
15
16use crate::record::Column;
17use crate::value::Value;
18use serde::{Deserialize, Serialize};
19use std::collections::BTreeSet;
20
/// Minimum number of values a numeric column needs before a strictly monotonic
/// run is accepted as a sequence (a counter/timestamp ramp). Shorter columns
/// do not carry enough evidence to be called a ramp — and a near-constant
/// column with a couple of outliers is *not* a sequence.
const SEQUENCE_MIN_LEN: usize = 20;
25
/// Name tokens that flag a column as an identifier. The column name is split
/// on non-alphanumerics and camelCase boundaries and lowercased; if any of the
/// resulting tokens equals one of these entries, the name reads as an id.
///
/// NOTE(review): the bare `session` token also matches measurement-style names
/// such as `session_duration` — confirm that this is intended.
const ID_TOKENS: &[&str] = &[
    "id",
    "ids",
    "uid",
    "uuid",
    "guid",
    "gid",
    "pid",
    "procid",
    "ppid",
    "tid",
    "sid",
    "session",
    "sessionid",
];
43
44/// What a column appears to be. Only [`Role::Measurement`] columns are subject to
45/// magnitude-outlier (point) detection.
46#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
47#[serde(rename_all = "lowercase")]
48pub enum Role {
49    /// Continuous numeric measurement — outlier detection is meaningful.
50    Measurement,
51    /// An arbitrary label (process id, uuid, foreign key); magnitude is meaningless.
52    Identifier,
53    /// A low-cardinality code / enum / flag.
54    Categorical,
55    /// A strictly monotonic sequence (timestamp, counter, auto-increment id).
56    Sequence,
57    /// A single value, or none.
58    Constant,
59}
60
61impl Role {
62    pub fn token(self) -> &'static str {
63        match self {
64            Role::Measurement => "measurement",
65            Role::Identifier => "identifier",
66            Role::Categorical => "categorical",
67            Role::Sequence => "sequence",
68            Role::Constant => "constant",
69        }
70    }
71
72    /// Whether magnitude-outlier (point) detection is meaningful for this role.
73    pub fn is_measured(self) -> bool {
74        matches!(self, Role::Measurement)
75    }
76
77    /// Whether a *value-distribution* detector (point, contextual, collective,
78    /// distributional, multivariate) should skip a column of this role: an
79    /// `Identifier` is an arbitrary label and a `Sequence` is a monotonic ramp,
80    /// so any value-based anomaly statistic on them is noise, not signal. A
81    /// `Constant` is left to each detector (it naturally produces nothing), and a
82    /// `Categorical` is the chi-square detector's legitimate input.
83    pub fn skips_value_detection(self) -> bool {
84        matches!(self, Role::Identifier | Role::Sequence)
85    }
86}
87
88/// A column name paired with its classified role, for the envelope.
89#[derive(Debug, Clone, Serialize)]
90pub struct ColumnRole {
91    pub column: String,
92    pub role: Role,
93}
94
/// Splits a column name into lowercased tokens.
///
/// Any character that is not ASCII alphanumeric acts as a separator, and a
/// camelCase boundary (an uppercase letter directly after a lowercase letter or
/// a digit) starts a new token: `"SYSLOG_PID"` → `[syslog, pid]`,
/// `"durationNanos"` → `[duration, nanos]`. Empty tokens are never produced.
fn name_tokens(name: &str) -> Vec<String> {
    let mut tokens: Vec<String> = Vec::new();
    let mut word = String::new();
    // True when the previous character was an ASCII lowercase letter or digit.
    let mut after_lower_or_digit = false;

    for c in name.chars() {
        if !c.is_ascii_alphanumeric() {
            // Separator: close the token in progress, if there is one.
            if !word.is_empty() {
                tokens.push(std::mem::take(&mut word));
            }
            after_lower_or_digit = false;
            continue;
        }

        // camelCase boundary: an uppercase letter following lowercase/digit.
        let starts_new_word = after_lower_or_digit && c.is_ascii_uppercase();
        if starts_new_word && !word.is_empty() {
            tokens.push(std::mem::take(&mut word));
        }
        word.push(c.to_ascii_lowercase());
        // `c` is ASCII alphanumeric here, so "not uppercase" means lowercase or digit.
        after_lower_or_digit = !c.is_ascii_uppercase();
    }

    if !word.is_empty() {
        tokens.push(word);
    }
    tokens
}
120
121/// Whether a column name reads as an identifier (any token in [`ID_TOKENS`]).
122pub fn name_is_identifier(name: &str) -> bool {
123    name_tokens(name)
124        .iter()
125        .any(|t| ID_TOKENS.contains(&t.as_str()))
126}
127
/// Name tokens that flag a clock/time column. A timestamp is a monotonic-ish
/// clock value, never a measurement to outlier-test. Real timestamps
/// (journald's `__REALTIME_TIMESTAMP`, a pcap `timestamp`) tie or regress just
/// often enough to fail strict-monotonic [`Role::Sequence`] detection, so they
/// are also caught by name. The list is deliberately narrow (`timestamp`/`ts`)
/// so that `response_time`-style measurements — which *should* be
/// outlier-tested — are not matched.
const TIME_TOKENS: &[&str] = &["timestamp", "ts"];
135
136/// Whether a column name reads as a timestamp/clock column.
137pub fn name_is_timestamp(name: &str) -> bool {
138    name_tokens(name)
139        .iter()
140        .any(|t| TIME_TOKENS.contains(&t.as_str()))
141}
142
143/// A stable per-value key for distinct counting (NaN-safe via bit pattern).
144fn distinct_key(v: &Value) -> String {
145    match v {
146        Value::Null => String::new(),
147        Value::Bool(b) => format!("b{b}"),
148        Value::Int(i) => format!("i{i}"),
149        Value::Float(f) => format!("f{}", f.to_bits()),
150        Value::Str(s) => format!("s{s}"),
151    }
152}
153
/// Returns `true` when `xs` has at least two values and every step moves in the
/// same direction — all strictly increasing or all strictly decreasing. Equal
/// neighbours break strictness, and a step involving NaN counts as neither up
/// nor down.
fn is_strictly_monotonic(xs: &[f64]) -> bool {
    // Fewer than two points have no direction at all.
    if xs.len() < 2 {
        return false;
    }

    let mut rising = true;
    let mut falling = true;
    for (prev, next) in xs.iter().zip(&xs[1..]) {
        rising = rising && prev < next;
        falling = falling && prev > next;
        if !rising && !falling {
            // Both directions are already ruled out.
            return false;
        }
    }
    rising || falling
}
162
163impl Column {
164    /// Number of distinct non-null values in this column.
165    fn distinct_count(&self) -> usize {
166        let mut seen = BTreeSet::new();
167        for c in &self.cells {
168            if !matches!(c, Value::Null) {
169                seen.insert(distinct_key(c));
170            }
171        }
172        seen.len()
173    }
174
175    /// Classifies this column's [`Role`]. Deterministic; order of checks matters:
176    /// constant first, then identifier (by name), then — for numeric columns — a
177    /// monotonic sequence else a continuous measurement; non-numeric columns that
178    /// are none of the above are categorical (labels/codes/free text).
179    ///
180    /// Cardinality is deliberately *not* used to call a numeric column
181    /// "categorical": a column that is one value with a few wild outliers has low
182    /// cardinality yet is exactly what point detection should catch. Identifiers
183    /// are caught by name, not by how many distinct values they hold.
184    pub fn role(&self) -> Role {
185        if self.distinct_count() <= 1 {
186            return Role::Constant;
187        }
188        if name_is_identifier(&self.name) {
189            return Role::Identifier;
190        }
191        // A clock column is a sequence regardless of type or exact monotonicity.
192        if name_is_timestamp(&self.name) {
193            return Role::Sequence;
194        }
195        if self.ty.is_numeric() {
196            let xs = self.numeric();
197            if xs.len() >= SEQUENCE_MIN_LEN && is_strictly_monotonic(&xs) {
198                Role::Sequence
199            } else {
200                Role::Measurement
201            }
202        } else {
203            Role::Categorical
204        }
205    }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a column with the given name from raw cells.
    fn col(name: &str, cells: Vec<Value>) -> Column {
        Column::new(name, cells)
    }

    /// Builds a column with the given name whose cells are all `Value::Int`.
    fn ints(name: &str, xs: &[i64]) -> Column {
        col(name, xs.iter().map(|&i| Value::Int(i)).collect())
    }

    #[test]
    fn role_tokens_and_is_measured() {
        assert_eq!(Role::Measurement.token(), "measurement");
        assert_eq!(Role::Identifier.token(), "identifier");
        assert_eq!(Role::Categorical.token(), "categorical");
        assert_eq!(Role::Sequence.token(), "sequence");
        assert_eq!(Role::Constant.token(), "constant");
        // Only a measurement is subject to magnitude-outlier detection.
        assert!(Role::Measurement.is_measured());
        for r in [
            Role::Identifier,
            Role::Categorical,
            Role::Sequence,
            Role::Constant,
        ] {
            assert!(!r.is_measured(), "{:?} must not be measured", r);
        }
    }

    #[test]
    fn skips_value_detection_targets_identifier_and_sequence() {
        assert!(Role::Identifier.skips_value_detection());
        assert!(Role::Sequence.skips_value_detection());
        // Measurement is analyzed; Categorical feeds chi-square; Constant is
        // left to each detector to no-op — none are skipped by this gate.
        for r in [Role::Measurement, Role::Categorical, Role::Constant] {
            assert!(!r.skips_value_detection(), "{:?} must not be skipped", r);
        }
    }

    #[test]
    fn strictly_monotonic_predicate() {
        assert!(is_strictly_monotonic(&[1.0, 2.0, 3.0])); // increasing
        assert!(is_strictly_monotonic(&[3.0, 2.0, 1.0])); // decreasing
        assert!(is_strictly_monotonic(&[1.0, 2.0])); // len 2 still decides (pins the len guard)
        assert!(!is_strictly_monotonic(&[1.0, 1.0, 2.0])); // not strict (equal step up)
        assert!(!is_strictly_monotonic(&[3.0, 3.0, 1.0])); // not strict (equal step down)
        assert!(!is_strictly_monotonic(&[1.0, 3.0, 2.0])); // up then down
        assert!(!is_strictly_monotonic(&[5.0])); // too short
        assert!(!is_strictly_monotonic(&[])); // empty
    }

    #[test]
    fn name_tokenizer_splits_underscores_and_camel_case() {
        assert_eq!(name_tokens("SYSLOG_PID"), vec!["syslog", "pid"]);
        assert_eq!(name_tokens("durationNanos"), vec!["duration", "nanos"]);
        assert_eq!(name_tokens("sessionId"), vec!["session", "id"]);
        assert_eq!(name_tokens("_PID"), vec!["pid"]);
        assert_eq!(name_tokens("JOB_ID"), vec!["job", "id"]);
    }

    #[test]
    fn identifier_names_recognized_without_false_positives() {
        for id in [
            "_PID",
            "_UID",
            "_GID",
            "JOB_ID",
            "TID",
            "SYSLOG_PID",
            "user_id",
            "uuid",
            "procid",
        ] {
            assert!(
                name_is_identifier(id),
                "{id} should look like an identifier"
            );
        }
        // Continuous measurements are never named like ids — no false positives.
        // `valid` and `period` merely *contain* "id"; whole-token matching must
        // not trip on them.
        for m in [
            "DAYS_LOST",
            "durationNanos",
            "fare",
            "age",
            "humidity",
            "valid",
            "period",
        ] {
            assert!(
                !name_is_identifier(m),
                "{m} must NOT look like an identifier"
            );
        }
    }

    #[test]
    fn timestamp_named_columns_are_sequences() {
        // journald's near-monotonic clocks tie/regress, so strict-monotonic
        // detection misses them; the name catches them. Numeric or not.
        for ts in [
            "__REALTIME_TIMESTAMP",
            "__MONOTONIC_TIMESTAMP",
            "timestamp",
            "ts",
        ] {
            assert!(name_is_timestamp(ts), "{ts} should read as a timestamp");
            // a jittery (non-strictly-monotonic) clock column → Sequence by name
            let jittery = ints(ts, &[100, 101, 101, 105, 104, 110, 130]);
            assert_eq!(jittery.role(), Role::Sequence, "{ts}");
        }
        // but a real measurement whose name merely contains "time" is NOT a
        // timestamp — we still want outliers on it.
        for m in ["response_time", "DAYS_LOST", "duration_ms", "fare"] {
            assert!(!name_is_timestamp(m), "{m} must NOT read as a timestamp");
        }
    }

    #[test]
    fn constant_takes_precedence() {
        assert_eq!(ints("anything", &[5, 5, 5, 5]).role(), Role::Constant);
        // Even an id-named single-value column is Constant (distinct <= 1 first).
        assert_eq!(ints("user_id", &[7, 7]).role(), Role::Constant);
    }

    #[test]
    fn identifier_by_name_beats_a_numeric_distribution() {
        // A process-id column is mid-cardinality with repeats — statistically a
        // discrete measurement, but its name gives it away.
        let pid = ints("_PID", &[100, 200, 100, 300, 200, 100, 400, 300, 100]);
        assert_eq!(pid.role(), Role::Identifier);
    }

    #[test]
    fn long_strictly_monotonic_numeric_is_a_sequence() {
        // Neutral column names on purpose: a timestamp-like name (e.g. `ts`)
        // is classified as a Sequence by name alone and would never reach the
        // monotonicity check this test is about.
        let up: Vec<i64> = (0..40).collect();
        assert_eq!(ints("counter", &up).role(), Role::Sequence);
        let down: Vec<i64> = (0..40).rev().collect();
        assert_eq!(ints("countdown", &down).role(), Role::Sequence);
        // A short monotonic run is NOT enough evidence (below SEQUENCE_MIN_LEN).
        assert_eq!(
            ints("small", &[10, 11, 14, 20, 31]).role(),
            Role::Measurement
        );
    }

    #[test]
    fn near_constant_with_outliers_stays_measurement_not_categorical() {
        // Low cardinality (2 distinct), but this is the canonical point-outlier
        // shape — it must remain a measurement, never be skipped as a category.
        let mut xs = vec![10i64; 30];
        xs.push(1000);
        assert_eq!(ints("x", &xs).role(), Role::Measurement);
    }

    #[test]
    fn non_numeric_default_is_categorical() {
        let msg = col(
            "message",
            (0..50).map(|i| Value::Str(format!("event {i}"))).collect(),
        );
        assert_eq!(msg.role(), Role::Categorical);
        // A constant string column is still Constant (distinct <= 1 wins).
        let same = col("kind", vec![Value::Str("a".into()); 5]);
        assert_eq!(same.role(), Role::Constant);
    }
}