use crate::record::Column;
use crate::value::Value;
use serde::{Deserialize, Serialize};
use std::collections::BTreeSet;
/// Minimum length the result of `Column::numeric()` must have before
/// `Column::role` will classify a numeric column as `Role::Sequence`.
/// Shorter columns are never reported as sequences, even when their values
/// are strictly monotonic.
const SEQUENCE_MIN_LEN: usize = 20;
/// Lowercase name tokens that mark a column as an identifier.
///
/// `name_is_identifier` compares every token produced by `name_tokens`
/// against this list with exact equality, so an entry matches only a whole
/// token and never a substring of a longer word.
const ID_TOKENS: &[&str] = &[
"id",
"ids",
"uid",
"uuid",
"guid",
"gid",
"pid",
"procid",
"ppid",
"tid",
"sid",
"session",
"sessionid",
];
/// Role assigned to a column by `Column::role`.
///
/// Because of `rename_all = "lowercase"`, serde reads and writes each variant
/// as its lowercased name (for example `Measurement` <-> `"measurement"`);
/// these are the same strings that `Role::token` returns.
///
/// `Column::role` checks the roles in this order: `Constant`, `Identifier`,
/// then `Sequence` / `Measurement` for numeric columns, else `Categorical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Role {
/// Numeric column that matched none of the earlier roles.
Measurement,
/// Column whose name contains one of the `ID_TOKENS` as a whole token.
Identifier,
/// Non-numeric column that is neither constant nor identifier-named.
Categorical,
/// Numeric column with at least `SEQUENCE_MIN_LEN` numeric values that are
/// strictly increasing or strictly decreasing.
Sequence,
/// Column with at most one distinct non-null value.
Constant,
}
impl Role {
pub fn token(self) -> &'static str {
match self {
Role::Measurement => "measurement",
Role::Identifier => "identifier",
Role::Categorical => "categorical",
Role::Sequence => "sequence",
Role::Constant => "constant",
}
}
pub fn is_measured(self) -> bool {
matches!(self, Role::Measurement)
}
}
/// Serializable pairing of a column label with a `Role`.
///
/// Only `Serialize` is derived, so this type can be written out but not
/// read back through serde.
#[derive(Debug, Clone, Serialize)]
pub struct ColumnRole {
/// Label of the column the role belongs to (presumably the column's
/// name; confirm at the construction site, which is not in this file).
pub column: String,
/// Role recorded for that column.
pub role: Role,
}
/// Splits a column name into lowercase ASCII tokens.
///
/// A token boundary occurs:
/// * at every character that is not ASCII alphanumeric (such characters are
///   dropped and never appear in a token), and
/// * before an ASCII uppercase letter that directly follows an ASCII
///   lowercase letter or digit (the camelCase boundary).
///
/// A run of uppercase letters is not split, so `"SYSLOG_PID"` yields
/// `["syslog", "pid"]` and `"HTTPServer"` yields `["httpserver"]`.
/// Empty tokens are never produced.
fn name_tokens(name: &str) -> Vec<String> {
    let mut tokens: Vec<String> = Vec::new();
    let mut word = String::new();
    // Original-case character most recently appended to `word`; `None` at the
    // start of the name and right after a separator.
    let mut last: Option<char> = None;

    for c in name.chars() {
        if !c.is_ascii_alphanumeric() {
            // Separator: close the current word, if there is one.
            if !word.is_empty() {
                tokens.push(std::mem::take(&mut word));
            }
            last = None;
            continue;
        }

        // `last` being `Some` implies `word` is non-empty, so no extra
        // emptiness check is needed before flushing.
        let camel_boundary = c.is_ascii_uppercase()
            && matches!(last, Some(p) if p.is_ascii_lowercase() || p.is_ascii_digit());
        if camel_boundary {
            tokens.push(std::mem::take(&mut word));
        }

        word.push(c.to_ascii_lowercase());
        last = Some(c);
    }

    if !word.is_empty() {
        tokens.push(word);
    }
    tokens
}
/// Returns `true` when at least one token of `name` (as split by
/// `name_tokens`) is exactly equal to an entry of `ID_TOKENS`.
///
/// Matching is on whole tokens, so `"user_id"` matches through its `id`
/// token while `"valid"` does not.
pub fn name_is_identifier(name: &str) -> bool {
    for token in name_tokens(name) {
        let is_known = ID_TOKENS.iter().any(|&known| known == token.as_str());
        if is_known {
            return true;
        }
    }
    false
}
/// Builds the string key used to decide whether two cell values are distinct.
///
/// `Value::Null` maps to the empty string. Every other variant maps to a
/// one-letter type tag (`b`, `i`, `f`, `s`) followed by its payload, so values
/// of different variants never share a key (for example `Int(1)` gives `"i1"`
/// and `Str("1")` gives `"s1"`).
///
/// Floats are keyed on their raw bit pattern rather than their numeric value,
/// so `0.0` and `-0.0` produce different keys.
fn distinct_key(v: &Value) -> String {
    let (tag, payload) = match v {
        Value::Null => return String::new(),
        Value::Bool(flag) => ('b', flag.to_string()),
        Value::Int(number) => ('i', number.to_string()),
        Value::Float(real) => ('f', real.to_bits().to_string()),
        Value::Str(text) => ('s', text.to_string()),
    };
    format!("{tag}{payload}")
}
/// Returns `true` when `xs` holds at least two values and each value is
/// strictly greater than the one before it, or each value is strictly smaller
/// than the one before it.
///
/// Equal neighbours break monotonicity. Comparisons involving NaN are always
/// false, so a slice containing NaN is never reported as monotonic.
fn is_strictly_monotonic(xs: &[f64]) -> bool {
    if xs.len() < 2 {
        return false;
    }
    // Yields every (previous, next) pair of adjacent elements.
    let neighbours = || xs.iter().zip(xs.iter().skip(1));
    let rising = neighbours().all(|(prev, next)| prev < next);
    rising || neighbours().all(|(prev, next)| prev > next)
}
impl Column {
    /// Counts the distinct non-null values among this column's cells.
    ///
    /// Nulls are skipped; every other cell is reduced to its `distinct_key`
    /// and the unique keys are counted.
    fn distinct_count(&self) -> usize {
        self.cells
            .iter()
            .filter_map(|cell| match cell {
                Value::Null => None,
                present => Some(distinct_key(present)),
            })
            .collect::<BTreeSet<_>>()
            .len()
    }

    /// Classifies this column into a `Role`.
    ///
    /// The checks run in order and the first match wins:
    /// 1. at most one distinct non-null value (including an empty or all-null
    ///    column) -> `Role::Constant`;
    /// 2. the column name contains an identifier token -> `Role::Identifier`;
    /// 3. the column type is not numeric -> `Role::Categorical`;
    /// 4. at least `SEQUENCE_MIN_LEN` numeric values that are strictly
    ///    monotonic -> `Role::Sequence`;
    /// 5. anything else -> `Role::Measurement`.
    pub fn role(&self) -> Role {
        if self.distinct_count() < 2 {
            return Role::Constant;
        }
        if name_is_identifier(&self.name) {
            return Role::Identifier;
        }
        if !self.ty.is_numeric() {
            return Role::Categorical;
        }

        let values = self.numeric();
        let long_enough = values.len() >= SEQUENCE_MIN_LEN;
        if long_enough && is_strictly_monotonic(&values) {
            Role::Sequence
        } else {
            Role::Measurement
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Thin wrapper around `Column::new` to keep the tests terse.
    fn col(name: &str, cells: Vec<Value>) -> Column {
        Column::new(name, cells)
    }

    /// Builds a column whose cells are `Value::Int`, in the order given.
    fn ints(name: &str, xs: &[i64]) -> Column {
        let cells: Vec<Value> = xs.iter().copied().map(Value::Int).collect();
        col(name, cells)
    }

    #[test]
    fn role_tokens_and_is_measured() {
        let expected_tokens = [
            (Role::Measurement, "measurement"),
            (Role::Identifier, "identifier"),
            (Role::Categorical, "categorical"),
            (Role::Sequence, "sequence"),
            (Role::Constant, "constant"),
        ];
        for (role, token) in expected_tokens {
            assert_eq!(role.token(), token);
        }

        assert!(Role::Measurement.is_measured());
        let unmeasured = [
            Role::Identifier,
            Role::Categorical,
            Role::Sequence,
            Role::Constant,
        ];
        for r in unmeasured {
            assert!(!r.is_measured(), "{:?} must not be measured", r);
        }
    }

    #[test]
    fn strictly_monotonic_predicate() {
        // Strictly increasing or decreasing runs of two or more values.
        assert!(is_strictly_monotonic(&[1.0, 2.0, 3.0]));
        assert!(is_strictly_monotonic(&[3.0, 2.0, 1.0]));
        assert!(is_strictly_monotonic(&[1.0, 2.0]));

        // Ties and direction changes break strict monotonicity.
        assert!(!is_strictly_monotonic(&[1.0, 1.0, 2.0]));
        assert!(!is_strictly_monotonic(&[3.0, 3.0, 1.0]));
        assert!(!is_strictly_monotonic(&[1.0, 3.0, 2.0]));

        // Fewer than two values can never be monotonic.
        assert!(!is_strictly_monotonic(&[5.0]));
        assert!(!is_strictly_monotonic(&[]));
    }

    #[test]
    fn name_tokenizer_splits_underscores_and_camel_case() {
        let cases = [
            ("SYSLOG_PID", vec!["syslog", "pid"]),
            ("durationNanos", vec!["duration", "nanos"]),
            ("sessionId", vec!["session", "id"]),
            ("_PID", vec!["pid"]),
            ("JOB_ID", vec!["job", "id"]),
        ];
        for (raw, expected) in cases {
            assert_eq!(name_tokens(raw), expected);
        }
    }

    #[test]
    fn identifier_names_recognized_without_false_positives() {
        let identifier_like = [
            "_PID",
            "_UID",
            "_GID",
            "JOB_ID",
            "TID",
            "SYSLOG_PID",
            "user_id",
            "uuid",
            "procid",
        ];
        for id in identifier_like {
            assert!(
                name_is_identifier(id),
                "{id} should look like an identifier"
            );
        }

        // Includes words that merely contain an id token as a substring
        // ("valid", "period").
        let measurement_like = [
            "DAYS_LOST",
            "durationNanos",
            "fare",
            "age",
            "humidity",
            "valid",
            "period",
        ];
        for m in measurement_like {
            assert!(
                !name_is_identifier(m),
                "{m} must NOT look like an identifier"
            );
        }
    }

    #[test]
    fn constant_takes_precedence() {
        let plain = ints("anything", &[5, 5, 5, 5]);
        assert_eq!(plain.role(), Role::Constant);

        // An identifier-looking name does not override the constant check.
        let id_named = ints("user_id", &[7, 7]);
        assert_eq!(id_named.role(), Role::Constant);
    }

    #[test]
    fn identifier_by_name_beats_a_numeric_distribution() {
        let values = [100, 200, 100, 300, 200, 100, 400, 300, 100];
        assert_eq!(ints("_PID", &values).role(), Role::Identifier);
    }

    #[test]
    fn long_strictly_monotonic_numeric_is_a_sequence() {
        let ascending: Vec<i64> = (0..40).collect();
        let descending: Vec<i64> = ascending.iter().rev().copied().collect();
        assert_eq!(ints("ts", &ascending).role(), Role::Sequence);
        assert_eq!(ints("countdown", &descending).role(), Role::Sequence);

        // Monotonic but shorter than the sequence threshold.
        let short = ints("small", &[10, 11, 14, 20, 31]);
        assert_eq!(short.role(), Role::Measurement);
    }

    #[test]
    fn near_constant_with_outliers_stays_measurement_not_categorical() {
        let xs: Vec<i64> = std::iter::repeat(10)
            .take(30)
            .chain(std::iter::once(1000))
            .collect();
        assert_eq!(ints("x", &xs).role(), Role::Measurement);
    }

    #[test]
    fn non_numeric_default_is_categorical() {
        let cells: Vec<Value> = (0..50).map(|i| Value::Str(format!("event {i}"))).collect();
        assert_eq!(col("message", cells).role(), Role::Categorical);

        let same = col("kind", vec![Value::Str("a".into()); 5]);
        assert_eq!(same.role(), Role::Constant);
    }
}