use hyperloglog::HyperLogLog;
use indexmap::IndexMap;
use rhai::Dynamic;
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::hash::{Hash, Hasher};
use unicode_width::UnicodeWidthStr;
/// Number of distinct value hashes a field may hold in its exact set; once the
/// set grows past this, the tracker switches to a HyperLogLog estimate.
const EXACT_CARDINALITY_THRESHOLD: usize = 256;
/// Maximum number of sample values retained per field.
const MAX_SAMPLES: usize = 8;
/// Maximum number of characters of a single sample rendered for display.
const MAX_SAMPLE_LEN: usize = 80;
/// Maximum number of distinct field paths that are profiled; new paths beyond
/// this are dropped.
const MAX_TRACKED_FIELDS: usize = 1_000;
/// Depth at which nested maps/arrays stop being flattened (top-level fields
/// are observed at depth 1).
const MAX_FLATTEN_DEPTH: usize = 3;
/// Cap on the number of value hashes remembered for de-duplicating samples.
const MAX_DEDUP_TRACKING: usize = 1024;
/// Error rate handed to the HyperLogLog constructor.
const HLL_ERROR_RATE: f64 = 0.01;
/// Fixed seed handed to `HyperLogLog::new_deterministic`; the bytes spell
/// "field_discovery" in ASCII.
const HLL_SEED: u128 = 0x6669656c645f646973636f76657279;
/// Coarse type classification of a field value.
///
/// The derived ordering follows the declaration order of the variants.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum FieldType {
/// String value.
String,
/// Integer value.
Int,
/// Floating-point value.
Float,
/// Boolean value.
Bool,
/// Unit value, reported as "null".
Null,
/// Array value.
Array,
/// Map (object) value.
Map,
/// Single character value.
Char,
/// Any other value; carries the type name reported by the value itself.
Other(std::string::String),
}
impl FieldType {
pub fn from_dynamic(value: &Dynamic) -> Self {
if value.is_unit() {
FieldType::Null
} else if value.is_string() {
FieldType::String
} else if value.is_int() {
FieldType::Int
} else if value.is_float() {
FieldType::Float
} else if value.is_bool() {
FieldType::Bool
} else if value.is_char() {
FieldType::Char
} else if value.is_array() {
FieldType::Array
} else if value.is_map() {
FieldType::Map
} else {
FieldType::Other(value.type_name().to_string())
}
}
}
impl fmt::Display for FieldType {
    /// Writes the lowercase label of the type; `Other` writes the carried
    /// type name verbatim.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            FieldType::Other(type_name) => type_name.as_str(),
            FieldType::String => "string",
            FieldType::Int => "int",
            FieldType::Float => "float",
            FieldType::Bool => "bool",
            FieldType::Null => "null",
            FieldType::Array => "array",
            FieldType::Map => "map",
            FieldType::Char => "char",
        };
        f.write_str(label)
    }
}
/// Counts distinct value hashes, either exactly or approximately.
enum CardinalityTracker {
/// Exact set of the hashes inserted so far.
Exact(HashSet<u64>),
/// HyperLogLog sketch used once the exact set has grown past
/// `EXACT_CARDINALITY_THRESHOLD`.
Estimated(HyperLogLog),
}
impl CardinalityTracker {
    /// Starts in exact mode with an empty hash set.
    fn new() -> Self {
        Self::Exact(HashSet::new())
    }

    /// Records one value hash.
    ///
    /// In exact mode the hash joins the set; when the set grows beyond
    /// `EXACT_CARDINALITY_THRESHOLD`, every stored hash is replayed into a
    /// deterministic HyperLogLog and the tracker switches to estimated mode.
    fn insert(&mut self, hash: u64) {
        let exact_set = match self {
            CardinalityTracker::Estimated(sketch) => {
                sketch.insert(&hash);
                return;
            }
            CardinalityTracker::Exact(set) => set,
        };

        exact_set.insert(hash);
        if exact_set.len() <= EXACT_CARDINALITY_THRESHOLD {
            return;
        }

        let mut sketch = HyperLogLog::new_deterministic(HLL_ERROR_RATE, HLL_SEED);
        for stored in exact_set.iter() {
            sketch.insert(stored);
        }
        *self = CardinalityTracker::Estimated(sketch);
    }

    /// Returns `(count, exact)`: the distinct count and whether it is exact
    /// (`true`) or a HyperLogLog estimate (`false`).
    fn cardinality(&self) -> (usize, bool) {
        let count = match self {
            CardinalityTracker::Exact(set) => set.len(),
            CardinalityTracker::Estimated(sketch) => sketch.len() as usize,
        };
        (count, matches!(self, CardinalityTracker::Exact(_)))
    }
}
/// Accumulated statistics for a single field path.
pub struct FieldProfile {
/// Number of values observed for this field.
pub seen_count: usize,
/// Number of observations per value type.
pub type_counts: HashMap<FieldType, usize>,
// Distinct-value tracker; only fed by values that are not null, array or map.
cardinality: CardinalityTracker,
/// Retained sample values (at most `MAX_SAMPLES`), stored as JSON values.
pub samples: Vec<serde_json::Value>,
// Hashes of values already considered for sampling (bounded by
// `MAX_DEDUP_TRACKING`).
sample_hashes: HashSet<u64>,
// Number of values that passed the de-duplication check; used as the
// population size when replacing samples at random.
distinct_samples_seen: usize,
/// `(min, max)` length of the arrays observed, if any.
pub array_size_range: Option<(usize, usize)>,
/// `(min, max)` entry count of the maps observed, if any.
pub map_size_range: Option<(usize, usize)>,
}
impl FieldProfile {
    /// Creates an empty profile with no observations.
    fn new() -> Self {
        Self {
            seen_count: 0,
            type_counts: HashMap::new(),
            cardinality: CardinalityTracker::new(),
            samples: Vec::new(),
            sample_hashes: HashSet::new(),
            distinct_samples_seen: 0,
            array_size_range: None,
            map_size_range: None,
        }
    }

    /// Extends a `(min, max)` size range so that it also covers `len`.
    fn widen(range: Option<(usize, usize)>, len: usize) -> (usize, usize) {
        range.map_or((len, len), |(lo, hi)| (lo.min(len), hi.max(len)))
    }

    /// Records one observed value.
    ///
    /// Every value bumps `seen_count` and its type counter. Arrays and maps
    /// only update their size range, nulls contribute nothing else, and every
    /// other (scalar) value feeds the cardinality tracker and the sample
    /// reservoir.
    fn observe(&mut self, value: &Dynamic) {
        self.seen_count += 1;
        let ft = FieldType::from_dynamic(value);
        *self.type_counts.entry(ft.clone()).or_insert(0) += 1;
        match &ft {
            FieldType::Null => {}
            FieldType::Array => {
                if let Ok(arr) = value.clone().into_array() {
                    self.array_size_range = Some(Self::widen(self.array_size_range, arr.len()));
                }
            }
            FieldType::Map => {
                if let Some(map) = value.clone().try_cast::<rhai::Map>() {
                    self.map_size_range = Some(Self::widen(self.map_size_range, map.len()));
                }
            }
            // Listed exhaustively so a new `FieldType` variant forces a
            // decision here instead of silently being treated as a scalar.
            FieldType::String
            | FieldType::Int
            | FieldType::Float
            | FieldType::Bool
            | FieldType::Char
            | FieldType::Other(_) => {
                let display = scalar_display(value);
                let sample_value = scalar_to_json(value);
                // The type is part of the hash so that e.g. the int 42 and
                // the string "42" count as different values.
                let hash = hash_value(&ft, &display);
                self.cardinality.insert(hash);
                self.add_sample(hash, sample_value);
            }
        }
    }

    /// Offers a value to the sample reservoir.
    ///
    /// Values whose hash has already been remembered are ignored. Hashes are
    /// only remembered while fewer than `MAX_DEDUP_TRACKING` are stored; after
    /// that, unseen values still pass but are no longer remembered. The first
    /// `MAX_SAMPLES` accepted values fill the reservoir; later ones replace a
    /// random slot with probability `MAX_SAMPLES / distinct_samples_seen`
    /// (reservoir sampling).
    fn add_sample(&mut self, hash: u64, sample: serde_json::Value) {
        if self.sample_hashes.len() < MAX_DEDUP_TRACKING {
            if !self.sample_hashes.insert(hash) {
                return;
            }
        } else if self.sample_hashes.contains(&hash) {
            return;
        }
        self.distinct_samples_seen += 1;
        if self.samples.len() < MAX_SAMPLES {
            self.samples.push(sample);
        } else {
            let idx = fastrand::usize(0..self.distinct_samples_seen);
            if idx < MAX_SAMPLES {
                self.samples[idx] = sample;
            }
        }
    }

    /// Returns `(count, exact)` for the distinct scalar values observed;
    /// `exact` is `false` once the count is a HyperLogLog estimate.
    pub fn cardinality(&self) -> (usize, bool) {
        self.cardinality.cardinality()
    }

    /// Returns the observed types with their counts, most frequent first.
    ///
    /// Types with equal counts are ordered by `FieldType`'s own ordering.
    /// Without that tie-break the result would inherit the arbitrary
    /// iteration order of the underlying `HashMap` and differ between runs.
    pub fn types_by_frequency(&self) -> Vec<(FieldType, usize)> {
        let mut types: Vec<_> = self
            .type_counts
            .iter()
            .map(|(ft, &count)| (ft.clone(), count))
            .collect();
        types.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
        types
    }
}
/// Aggregated field profiles over a stream of events.
pub struct FieldDiscovery {
/// Profiles keyed by flattened field path (nested map keys joined with `.`,
/// array elements suffixed with `[]`).
pub fields: IndexMap<std::string::String, FieldProfile>,
/// Number of events passed to `observe_event`.
pub total_events: usize,
// Set once a new field path had to be dropped because `MAX_TRACKED_FIELDS`
// paths were already tracked.
capped: bool,
// Set when a map or array was encountered at `MAX_FLATTEN_DEPTH` and
// therefore not flattened further.
flatten_depth_capped: bool,
}
impl FieldDiscovery {
pub fn new() -> Self {
Self {
fields: IndexMap::new(),
total_events: 0,
capped: false,
flatten_depth_capped: false,
}
}
pub fn observe_event(&mut self, fields: &IndexMap<std::string::String, Dynamic>) {
self.total_events += 1;
for (key, value) in fields {
self.observe_path(key, value, 1);
}
}
fn observe_path(&mut self, path: &str, value: &Dynamic, depth: usize) {
self.record(path, value);
if depth >= MAX_FLATTEN_DEPTH {
if value.is_map() || value.is_array() {
self.flatten_depth_capped = true;
}
return;
}
if value.is_map() {
if let Some(map) = value.clone().try_cast::<rhai::Map>() {
for (k, v) in map.iter() {
let subkey = format!("{path}.{k}");
self.observe_path(&subkey, v, depth + 1);
}
}
} else if value.is_array() {
if let Ok(arr) = value.clone().into_array() {
let subkey = format!("{path}[]");
for elem in arr.iter() {
self.observe_path(&subkey, elem, depth + 1);
}
}
}
}
fn record(&mut self, path: &str, value: &Dynamic) {
if let Some(profile) = self.fields.get_mut(path) {
profile.observe(value);
} else {
if self.fields.len() >= MAX_TRACKED_FIELDS {
if !self.capped {
self.capped = true;
eprintln!(
"Warning: field discovery truncated at {} unique field names",
MAX_TRACKED_FIELDS
);
}
return;
}
let mut profile = FieldProfile::new();
profile.observe(value);
self.fields.insert(path.to_string(), profile);
}
}
pub fn format_table(&self) -> std::string::String {
self.format_table_for_width(crate::tty::get_terminal_width())
}
fn format_table_for_width(&self, terminal_width: usize) -> std::string::String {
if self.fields.is_empty() {
return format!("Scanned {} events: no fields found\n", self.total_events);
}
let mut output = std::string::String::new();
output.push_str(&format!("Scanned {} events\n\n", self.total_events));
let terminal_width = terminal_width.max(36);
let mut entries: Vec<_> = self.fields.iter().collect();
entries.sort_by(|a, b| b.1.seen_count.cmp(&a.1.seen_count));
let rows: Vec<_> = entries
.iter()
.map(|(name, profile)| DiscoveryRow::from_profile(self.total_events, name, profile))
.collect();
if let Some(widths) = TableWidths::for_full_table(terminal_width, &rows) {
output.push_str(&pad_right_display("Field", widths.name));
output.push_str(" ");
output.push_str(&pad_right_display("Type", widths.types));
output.push_str(" ");
output.push_str(&pad_left_display("Seen", widths.seen));
output.push_str(" ");
output.push_str(&pad_left_display("Miss", widths.miss));
output.push_str(" ");
output.push_str(&pad_left_display("Uniq", widths.unique));
output.push_str(" Examples\n");
for row in &rows {
output.push_str(&pad_right_display(
&truncate_for_display(&row.name, widths.name),
widths.name,
));
output.push_str(" ");
output.push_str(&pad_right_display(
&truncate_for_display(&row.types, widths.types),
widths.types,
));
output.push_str(" ");
output.push_str(&pad_left_display(&row.seen_count.to_string(), widths.seen));
output.push_str(" ");
output.push_str(&pad_left_display(
&format!("{:.0}%", row.miss_pct),
widths.miss,
));
output.push_str(" ");
output.push_str(&pad_left_display(&row.unique, widths.unique));
output.push_str(" ");
output.push_str(&truncate_for_display(&row.examples, widths.examples));
output.push('\n');
}
} else if let Some(widths) = TableWidths::for_compact_table(terminal_width, &rows) {
output.push_str(&pad_right_display("Field", widths.name));
output.push_str(" ");
output.push_str(&pad_right_display("Type", widths.types));
output.push_str(" ");
output.push_str(&pad_left_display("Seen", widths.seen));
output.push_str(" ");
output.push_str(&pad_left_display("Miss", widths.miss));
output.push_str(" ");
output.push_str(&pad_left_display("Uniq", widths.unique));
output.push('\n');
for row in &rows {
output.push_str(&pad_right_display(
&truncate_for_display(&row.name, widths.name),
widths.name,
));
output.push_str(" ");
output.push_str(&pad_right_display(
&truncate_for_display(&row.types, widths.types),
widths.types,
));
output.push_str(" ");
output.push_str(&pad_left_display(&row.seen_count.to_string(), widths.seen));
output.push_str(" ");
output.push_str(&pad_left_display(
&format!("{:.0}%", row.miss_pct),
widths.miss,
));
output.push_str(" ");
output.push_str(&pad_left_display(&row.unique, widths.unique));
output.push('\n');
if !row.examples.is_empty() {
output.push_str(" ");
output.push_str(&truncate_for_display(
&row.examples,
terminal_width.saturating_sub(2),
));
output.push('\n');
}
}
} else {
for (idx, row) in rows.iter().enumerate() {
if idx > 0 {
output.push('\n');
}
output.push_str(&truncate_for_display(&row.name, terminal_width));
output.push('\n');
output.push_str(&format!(
" seen: {} miss: {:.0}%\n",
row.seen_count, row.miss_pct
));
output.push_str(" type: ");
output.push_str(&truncate_for_display(
&row.types,
terminal_width.saturating_sub(8),
));
output.push('\n');
output.push_str(" unique: ");
output.push_str(&truncate_for_display(
&row.unique,
terminal_width.saturating_sub(10),
));
output.push('\n');
if !row.examples.is_empty() {
output.push_str(" examples: ");
output.push_str(&truncate_for_display(
&row.examples,
terminal_width.saturating_sub(16),
));
output.push('\n');
}
}
}
if self.capped {
output.push_str(&format!(
"\n(Field tracking capped at {} unique field names)\n",
MAX_TRACKED_FIELDS
));
}
if self.flatten_depth_capped {
output.push_str(&format!(
"\nNote: Nested field flattening stopped at depth {}; deeper children are not shown.\n",
MAX_FLATTEN_DEPTH
));
}
output
}
pub fn format_json(&self) -> std::string::String {
let mut fields_json = Vec::new();
for (name, profile) in &self.fields {
let types: Vec<serde_json::Value> = profile
.types_by_frequency()
.iter()
.map(|(ft, count)| {
serde_json::json!({
"type": ft.to_string(),
"count": count,
})
})
.collect();
let (card_count, card_exact) = profile.cardinality();
let mut field_obj = serde_json::json!({
"name": name,
"seen": profile.seen_count,
"missing": self.total_events.saturating_sub(profile.seen_count),
"types": types,
"cardinality": {
"count": card_count,
"exact": card_exact,
},
"samples": profile.samples,
});
if let Some((lo, hi)) = profile.array_size_range {
field_obj["array_size"] = serde_json::json!({"min": lo, "max": hi});
}
if let Some((lo, hi)) = profile.map_size_range {
field_obj["map_size"] = serde_json::json!({"min": lo, "max": hi});
}
fields_json.push(field_obj);
}
let result = serde_json::json!({
"total_events": self.total_events,
"fields": fields_json,
"truncated": self.capped,
"flatten_depth_limit": MAX_FLATTEN_DEPTH,
"flatten_depth_capped": self.flatten_depth_capped,
});
serde_json::to_string_pretty(&result).unwrap_or_else(|_| "{}".to_string())
}
}
/// Renders the observed types of a field, most frequent first, joined by
/// `", "`. Arrays and maps carry their observed size or size range, e.g.
/// `array(2)` or `map(1..3)`. Returns `"-"` when no type was observed.
fn format_types(profile: &FieldProfile) -> std::string::String {
    /// Decorates a container label with its size or size range, if known.
    fn with_size(label: &str, range: Option<(usize, usize)>) -> std::string::String {
        match range {
            None => label.to_string(),
            Some((lo, hi)) if lo == hi => format!("{}({})", label, lo),
            Some((lo, hi)) => format!("{}({}..{})", label, lo, hi),
        }
    }

    let ranked = profile.types_by_frequency();
    if ranked.is_empty() {
        return "-".to_string();
    }

    let mut labels: Vec<std::string::String> = Vec::with_capacity(ranked.len());
    for (ft, _) in &ranked {
        let label = match ft {
            FieldType::Array => with_size("array", profile.array_size_range),
            FieldType::Map => with_size("map", profile.map_size_range),
            other => other.to_string(),
        };
        labels.push(label);
    }
    labels.join(", ")
}
/// Renders the distinct-value count of a field for the table.
///
/// Returns an em dash when the field only ever held maps, arrays or nulls (or
/// the count is zero); otherwise the count, prefixed with `~` when it is an
/// estimate.
fn format_cardinality(profile: &FieldProfile) -> std::string::String {
    let only_non_scalar = profile
        .type_counts
        .keys()
        .all(|ft| matches!(ft, FieldType::Map | FieldType::Array | FieldType::Null));
    let (count, exact) = profile.cardinality();

    if only_non_scalar || count == 0 {
        return "\u{2014}".to_string();
    }

    let prefix = if exact { "" } else { "~" };
    format!("{}{}", prefix, count)
}
/// Renders the retained samples as one comma-separated string, shortened to at
/// most 60 characters. Returns an empty string when there are no samples.
fn format_examples(profile: &FieldProfile) -> std::string::String {
    if profile.samples.is_empty() {
        return std::string::String::new();
    }

    let rendered: Vec<std::string::String> =
        profile.samples.iter().map(sample_json_display).collect();
    let joined = rendered.join(", ");

    // `nth(60)` yields a character only when the text is longer than 60.
    match joined.chars().nth(60) {
        Some(_) => truncate_for_display(&joined, 60),
        None => joined,
    }
}
/// Produces the text form of a scalar value that is used for hashing.
///
/// Strings, bools, ints, floats and chars are rendered from their typed
/// value; anything else (or a failed conversion) falls back to the value's
/// own `to_string()`.
fn scalar_display(value: &Dynamic) -> std::string::String {
    let typed: Option<std::string::String> = if value.is_string() {
        value.clone().into_string().ok()
    } else if value.is_bool() {
        value.as_bool().ok().map(|b| b.to_string())
    } else if value.is_int() {
        value.as_int().ok().map(|i| i.to_string())
    } else if value.is_float() {
        value.as_float().ok().map(|f| format!("{f}"))
    } else if value.is_char() {
        value.as_char().ok().map(|c| c.to_string())
    } else {
        None
    };
    typed.unwrap_or_else(|| value.to_string())
}
fn scalar_to_json(value: &Dynamic) -> serde_json::Value {
if value.is_string() {
value
.clone()
.into_string()
.map(serde_json::Value::String)
.unwrap_or(serde_json::Value::Null)
} else if value.is_int() {
value
.as_int()
.map(|i| serde_json::Value::Number(serde_json::Number::from(i)))
.unwrap_or(serde_json::Value::Null)
} else if value.is_float() {
value
.as_float()
.ok()
.and_then(serde_json::Number::from_f64)
.map(serde_json::Value::Number)
.unwrap_or(serde_json::Value::Null)
} else if value.is_bool() {
value
.as_bool()
.map(serde_json::Value::Bool)
.unwrap_or(serde_json::Value::Null)
} else if value.is_char() {
value
.as_char()
.map(|c| serde_json::Value::String(c.to_string()))
.unwrap_or(serde_json::Value::Null)
} else if value.is_unit() {
serde_json::Value::Null
} else {
serde_json::Value::String(value.to_string())
}
}
/// Renders one stored sample for display: strings without quotes, `null` as
/// the literal text, everything else via its JSON text; the result is
/// shortened by `truncate_sample`.
fn sample_json_display(value: &serde_json::Value) -> std::string::String {
    if value.is_null() {
        return "null".to_string();
    }
    if let Some(text) = value.as_str() {
        return truncate_sample(text);
    }
    let json_text = value.to_string();
    truncate_sample(&json_text)
}
/// Hashes a value's type together with its text form, so equal text of
/// different types yields different hashes.
fn hash_value(ft: &FieldType, display: &str) -> u64 {
    let mut state = std::collections::hash_map::DefaultHasher::new();
    Hash::hash(ft, &mut state);
    Hash::hash(display, &mut state);
    Hasher::finish(&state)
}
/// Shortens a single sample to at most `MAX_SAMPLE_LEN` characters.
fn truncate_sample(s: &str) -> std::string::String {
    // A character at index MAX_SAMPLE_LEN exists only when the text is longer
    // than the limit.
    let exceeds_limit = s.chars().nth(MAX_SAMPLE_LEN).is_some();
    if exceeds_limit {
        truncate_for_display(s, MAX_SAMPLE_LEN)
    } else {
        s.to_string()
    }
}
/// Shortens `s` to at most `max_chars` characters, marking the cut with `...`.
///
/// * A string that already fits is returned unchanged, whatever the limit.
/// * A string that is too long is cut to `max_chars - 3` characters followed
///   by `...`.
/// * When the limit is 3 or less there is no room for text plus an ellipsis,
///   so a string that is too long is replaced by `max_chars` dots.
///
/// Lengths are counted in `char`s, not in terminal display columns.
fn truncate_for_display(s: &str, max_chars: usize) -> std::string::String {
    // Check the fit first: previously a tiny limit turned even a string that
    // fit (e.g. "ab" with a limit of 3) into dots.
    if s.chars().nth(max_chars).is_none() {
        return s.to_string();
    }
    if max_chars <= 3 {
        return ".".repeat(max_chars);
    }
    let mut out: std::string::String = s.chars().take(max_chars - 3).collect();
    out.push_str("...");
    out
}
fn display_width(s: &str) -> usize {
UnicodeWidthStr::width(s)
}
/// Left-aligns `s` by appending spaces until it occupies `width` display
/// columns. Text that is already that wide or wider is returned unchanged.
fn pad_right_display(s: &str, width: usize) -> std::string::String {
    let missing = width.saturating_sub(display_width(s));
    let mut padded = std::string::String::with_capacity(s.len() + missing);
    padded.push_str(s);
    padded.push_str(&" ".repeat(missing));
    padded
}
/// Right-aligns `s` by prepending spaces until it occupies `width` display
/// columns. Text that is already that wide or wider is returned unchanged.
fn pad_left_display(s: &str, width: usize) -> std::string::String {
    let missing = width.saturating_sub(display_width(s));
    let mut padded = " ".repeat(missing);
    padded.push_str(s);
    padded
}
/// Column widths chosen for one table layout.
struct TableWidths {
// Width of the "Field" column.
name: usize,
// Width of the "Seen" column.
seen: usize,
// Width of the "Miss" column.
miss: usize,
// Width of the "Type" column.
types: usize,
// Width of the "Uniq" column.
unique: usize,
// Width of the "Examples" column; 0 in the compact layout, which has no
// such column.
examples: usize,
}
impl TableWidths {
    /// Computes widths for the layout with an inline examples column.
    ///
    /// Returns `None` when the terminal cannot hold the fixed columns plus the
    /// minimum name, type and examples widths. Spare space is handed out in
    /// this order: types up to 18, name up to 26, types up to their cap, name
    /// up to its cap, and finally examples.
    fn for_full_table(terminal_width: usize, rows: &[DiscoveryRow]) -> Option<Self> {
        const MISS: usize = 4;
        const SEPARATORS: usize = 10;
        const NAME_FLOOR: usize = 12;
        const TYPES_FLOOR: usize = 6;
        const EXAMPLES_FLOOR: usize = 8;

        let seen = row_width(rows, |row| row.seen_count.to_string().len(), "Seen");
        let unique = row_width(rows, |row| display_width(&row.unique), "Uniq");

        let name_cap =
            row_width(rows, |row| display_width(&row.name), "Field").clamp(NAME_FLOOR, 40);
        let types_cap =
            row_width(rows, |row| display_width(&row.types), "Type").clamp(TYPES_FLOOR, 30);
        let examples_cap = row_width(rows, |row| display_width(&row.examples), "Examples");

        let flexible = terminal_width.checked_sub(seen + MISS + unique + SEPARATORS)?;
        if flexible < NAME_FLOOR + TYPES_FLOOR + EXAMPLES_FLOOR {
            return None;
        }

        let mut name = NAME_FLOOR.min(name_cap);
        let mut types = TYPES_FLOOR.min(types_cap);
        let mut examples = EXAMPLES_FLOOR.min(examples_cap.max(EXAMPLES_FLOOR));
        let mut spare = flexible.saturating_sub(name + types + examples);

        grow_width(&mut types, types_cap.min(18), &mut spare);
        grow_width(&mut name, name_cap.min(26), &mut spare);
        grow_width(&mut types, types_cap, &mut spare);
        grow_width(&mut name, name_cap, &mut spare);
        grow_width(&mut examples, examples_cap, &mut spare);

        Some(Self {
            name,
            seen,
            miss: MISS,
            types,
            unique,
            examples,
        })
    }

    /// Computes widths for the layout without an examples column.
    ///
    /// Returns `None` when the terminal cannot hold the fixed columns plus the
    /// minimum name and type widths. Types get up to a third of the flexible
    /// space (never below their minimum, never above their cap); name takes
    /// the rest, and anything beyond the name cap is offered back to types.
    fn for_compact_table(terminal_width: usize, rows: &[DiscoveryRow]) -> Option<Self> {
        const MISS: usize = 4;
        const SEPARATORS: usize = 8;
        const NAME_FLOOR: usize = 12;
        const TYPES_FLOOR: usize = 6;

        let seen = row_width(rows, |row| row.seen_count.to_string().len(), "Seen");
        let unique = row_width(rows, |row| display_width(&row.unique), "Uniq");

        let name_cap =
            row_width(rows, |row| display_width(&row.name), "Field").clamp(NAME_FLOOR, 40);
        let types_cap =
            row_width(rows, |row| display_width(&row.types), "Type").clamp(TYPES_FLOOR, 30);

        let flexible = terminal_width.checked_sub(seen + MISS + unique + SEPARATORS)?;
        if flexible < NAME_FLOOR + TYPES_FLOOR {
            return None;
        }

        let third = (flexible / 3).max(TYPES_FLOOR);
        let mut types = types_cap.min(third);
        let mut name = flexible.saturating_sub(types);
        if name < NAME_FLOOR || types < TYPES_FLOOR {
            return None;
        }

        if name > name_cap {
            let surplus = name - name_cap;
            name = name_cap;
            types = (types + surplus).min(types_cap);
        }

        Some(Self {
            name,
            seen,
            miss: MISS,
            types,
            unique,
            examples: 0,
        })
    }
}
/// Returns the widest value of `f` over all rows, but never less than the
/// display width of `header`.
fn row_width<F>(rows: &[DiscoveryRow], f: F, header: &str) -> usize
where
    F: Fn(&DiscoveryRow) -> usize,
{
    let header_width = display_width(header);
    rows.iter()
        .fold(header_width, |widest, row| widest.max(f(row)))
}
/// Grows `current` toward `target`, spending from `remaining`.
///
/// The growth is the smaller of the gap to `target` and the budget left; it is
/// zero when `current` already reaches `target` or the budget is empty.
fn grow_width(current: &mut usize, target: usize, remaining: &mut usize) {
    let step = target.saturating_sub(*current).min(*remaining);
    *current += step;
    *remaining -= step;
}
/// Pre-rendered cells for one field in the table output.
struct DiscoveryRow {
// Field path.
name: std::string::String,
// Number of values observed for the field.
seen_count: usize,
// Percentage of events in which the field was missing.
miss_pct: f64,
// Rendered type list (see `format_types`).
types: std::string::String,
// Rendered distinct-value count (see `format_cardinality`).
unique: std::string::String,
// Rendered sample values (see `format_examples`).
examples: std::string::String,
}
impl DiscoveryRow {
    /// Builds the table row for one field.
    ///
    /// The miss percentage is the share of `total_events` in which the field
    /// was not seen. It is 0 when there are no events, and also when the field
    /// was seen more often than there are events, because the subtraction
    /// saturates at zero.
    fn from_profile(total_events: usize, name: &str, profile: &FieldProfile) -> Self {
        let miss_pct = match total_events {
            0 => 0.0,
            total => {
                let missing = total.saturating_sub(profile.seen_count);
                (missing as f64 / total as f64) * 100.0
            }
        };
        Self {
            name: name.to_string(),
            seen_count: profile.seen_count,
            miss_pct,
            types: format_types(profile),
            unique: format_cardinality(profile),
            examples: format_examples(profile),
        }
    }
}
use std::cell::RefCell;
use std::sync::atomic::{AtomicBool, Ordering};
/// Process-wide switch: whether field discovery is turned on.
static ENABLED: AtomicBool = AtomicBool::new(false);
/// Process-wide flag stored by `enable` and read by `is_discover_final`.
static DISCOVER_FINAL: AtomicBool = AtomicBool::new(false);
thread_local! {
// Each thread accumulates into its own `FieldDiscovery`.
static THREAD_DISCOVERY: RefCell<FieldDiscovery> = RefCell::new(FieldDiscovery::new());
}
/// Turns field discovery on and stores the `discover_final` flag.
///
/// Both values are written with `Ordering::Relaxed`.
pub fn enable(discover_final: bool) {
ENABLED.store(true, Ordering::Relaxed);
DISCOVER_FINAL.store(discover_final, Ordering::Relaxed);
}
/// Returns whether field discovery has been turned on via `enable`.
pub fn is_enabled() -> bool {
ENABLED.load(Ordering::Relaxed)
}
/// Returns the `discover_final` flag last stored by `enable` (`false` if
/// `enable` was never called).
pub fn is_discover_final() -> bool {
DISCOVER_FINAL.load(Ordering::Relaxed)
}
/// Feeds one event's fields into the calling thread's discovery state.
///
/// Does nothing unless discovery has been enabled.
pub fn observe_event_fields(fields: &IndexMap<String, Dynamic>) {
    if is_enabled() {
        THREAD_DISCOVERY.with(|cell| cell.borrow_mut().observe_event(fields));
    }
}
/// Returns the calling thread's accumulated discovery state and leaves a
/// fresh, empty one in its place.
pub fn take_thread_discovery() -> FieldDiscovery {
    THREAD_DISCOVERY.with(|cell| cell.replace(FieldDiscovery::new()))
}
// Unit tests for field profiling, flattening and report rendering.
#[cfg(test)]
mod tests {
use super::*;
// Helpers that build `Dynamic` values of each type used by the tests.
fn make_string(s: &str) -> Dynamic {
Dynamic::from(s.to_string())
}
fn make_int(i: i64) -> Dynamic {
Dynamic::from(i)
}
fn make_float(f: f64) -> Dynamic {
Dynamic::from(f)
}
fn make_bool(b: bool) -> Dynamic {
Dynamic::from(b)
}
fn make_null() -> Dynamic {
Dynamic::UNIT
}
fn make_array(items: Vec<Dynamic>) -> Dynamic {
Dynamic::from(rhai::Array::from(items))
}
fn make_map(pairs: Vec<(&str, Dynamic)>) -> Dynamic {
let mut map = rhai::Map::new();
for (k, v) in pairs {
map.insert(k.into(), v);
}
Dynamic::from(map)
}
// Each helper value is classified as the matching `FieldType`.
#[test]
fn test_field_type_classification() {
assert_eq!(
FieldType::from_dynamic(&make_string("hello")),
FieldType::String
);
assert_eq!(FieldType::from_dynamic(&make_int(42)), FieldType::Int);
assert_eq!(FieldType::from_dynamic(&make_float(2.5)), FieldType::Float);
assert_eq!(FieldType::from_dynamic(&make_bool(true)), FieldType::Bool);
assert_eq!(FieldType::from_dynamic(&make_null()), FieldType::Null);
assert_eq!(
FieldType::from_dynamic(&make_array(vec![])),
FieldType::Array
);
assert_eq!(FieldType::from_dynamic(&make_map(vec![])), FieldType::Map);
}
// Repeated values count toward `seen_count` but not toward cardinality or samples.
#[test]
fn test_basic_field_profile() {
let mut profile = FieldProfile::new();
profile.observe(&make_string("hello"));
profile.observe(&make_string("world"));
profile.observe(&make_string("hello"));
assert_eq!(profile.seen_count, 3);
assert_eq!(profile.type_counts[&FieldType::String], 3);
let (card, exact) = profile.cardinality();
assert!(exact);
assert_eq!(card, 2); assert_eq!(profile.samples.len(), 2);
}
// Types are counted separately and the most frequent one is listed first.
#[test]
fn test_mixed_types() {
let mut profile = FieldProfile::new();
profile.observe(&make_int(200));
profile.observe(&make_int(404));
profile.observe(&make_string("N/A"));
assert_eq!(profile.seen_count, 3);
assert_eq!(profile.type_counts[&FieldType::Int], 2);
assert_eq!(profile.type_counts[&FieldType::String], 1);
let types = profile.types_by_frequency();
assert_eq!(types[0].0, FieldType::Int); }
// Nulls are counted as seen and typed, but do not add to cardinality.
#[test]
fn test_null_not_counted_in_cardinality() {
let mut profile = FieldProfile::new();
profile.observe(&make_null());
profile.observe(&make_null());
profile.observe(&make_string("value"));
assert_eq!(profile.seen_count, 3);
assert_eq!(profile.type_counts[&FieldType::Null], 2);
let (card, exact) = profile.cardinality();
assert!(exact);
assert_eq!(card, 1); }
// The int 42 and the string "42" are distinct values.
#[test]
fn test_int_vs_string_distinct_cardinality() {
let mut profile = FieldProfile::new();
profile.observe(&make_int(42));
profile.observe(&make_string("42"));
let (card, exact) = profile.cardinality();
assert!(exact);
assert_eq!(card, 2); }
// Arrays record their min/max length and do not add to cardinality.
#[test]
fn test_array_size_range() {
let mut profile = FieldProfile::new();
profile.observe(&make_array(vec![make_int(1), make_int(2)]));
profile.observe(&make_array(vec![
make_int(1),
make_int(2),
make_int(3),
make_int(4),
make_int(5),
]));
assert_eq!(profile.array_size_range, Some((2, 5)));
let (card, _) = profile.cardinality();
assert_eq!(card, 0); }
// Maps record their min/max entry count.
#[test]
fn test_map_size_range() {
let mut profile = FieldProfile::new();
profile.observe(&make_map(vec![("a", make_int(1))]));
profile.observe(&make_map(vec![
("a", make_int(1)),
("b", make_int(2)),
("c", make_int(3)),
]));
assert_eq!(profile.map_size_range, Some((1, 3)));
}
// 300 distinct values exceed the exact threshold, so the count becomes an estimate.
#[test]
fn test_hll_graduation() {
let mut profile = FieldProfile::new();
for i in 0..300 {
profile.observe(&make_int(i));
}
let (card, exact) = profile.cardinality();
assert!(!exact, "Should have graduated to HLL");
assert!(
(270..=330).contains(&card),
"HLL estimate {} out of range",
card
);
}
// Two events: shared fields are seen twice, the extra field once.
#[test]
fn test_field_discovery_basic() {
let mut discovery = FieldDiscovery::new();
let mut fields1 = IndexMap::new();
fields1.insert("level".to_string(), make_string("INFO"));
fields1.insert("message".to_string(), make_string("hello"));
fields1.insert("status".to_string(), make_int(200));
let mut fields2 = IndexMap::new();
fields2.insert("level".to_string(), make_string("ERROR"));
fields2.insert("message".to_string(), make_string("fail"));
discovery.observe_event(&fields1);
discovery.observe_event(&fields2);
assert_eq!(discovery.total_events, 2);
assert_eq!(discovery.fields.len(), 3);
assert_eq!(discovery.fields["level"].seen_count, 2);
assert_eq!(discovery.fields["status"].seen_count, 1);
}
// The table mentions the event count, the field names and their type.
#[test]
fn test_format_table_not_empty() {
let mut discovery = FieldDiscovery::new();
let mut fields = IndexMap::new();
fields.insert("level".to_string(), make_string("INFO"));
fields.insert("msg".to_string(), make_string("test"));
discovery.observe_event(&fields);
let table = discovery.format_table();
assert!(table.contains("Scanned 1 events"));
assert!(table.contains("level"));
assert!(table.contains("msg"));
assert!(table.contains("string"));
}
// At width 56 a tabular layout with column headers is used, not the per-field listing.
#[test]
fn test_format_table_compact_layout_on_medium_width() {
let mut discovery = FieldDiscovery::new();
let mut fields = IndexMap::new();
fields.insert(
"very.long.field.name".to_string(),
make_string("this is a long example value"),
);
discovery.observe_event(&fields);
let table = discovery.format_table_for_width(56);
assert!(table.contains("Field"));
assert!(table.contains("Type"));
assert!(table.contains("Seen"));
assert!(table.contains("Miss"));
assert!(table.contains("Uniq"));
assert!(!table.contains(" examples: "));
assert!(!table.contains(" seen: "));
}
// At width 38 the example still appears, either labelled or on its own line.
#[test]
fn test_format_table_narrow_layout_on_small_width() {
let mut discovery = FieldDiscovery::new();
let mut fields = IndexMap::new();
fields.insert("request_id".to_string(), make_string("req_001"));
discovery.observe_event(&fields);
let table = discovery.format_table_for_width(38);
assert!(table.contains("request_id"));
assert!(table.contains("req_001"));
assert!(table.contains("1"));
assert!(table.contains("0%"));
assert!(
table.contains(" examples: req_001")
|| table.lines().any(|line| line.starts_with(" req_001")),
"{table}"
);
}
// The JSON report parses and exposes counts, cardinality and samples.
#[test]
fn test_format_json() {
let mut discovery = FieldDiscovery::new();
let mut fields = IndexMap::new();
fields.insert("level".to_string(), make_string("INFO"));
discovery.observe_event(&fields);
let json = discovery.format_json();
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
assert_eq!(parsed["total_events"], 1);
assert_eq!(parsed["fields"][0]["name"], "level");
assert_eq!(parsed["fields"][0]["seen"], 1);
assert_eq!(parsed["fields"][0]["cardinality"]["exact"], true);
assert_eq!(parsed["fields"][0]["samples"][0], "INFO");
}
// With no events the table reports that no fields were found.
#[test]
fn test_empty_discovery() {
let discovery = FieldDiscovery::new();
let table = discovery.format_table();
assert!(table.contains("Scanned 0 events"));
assert!(table.contains("no fields found"));
}
// No more than `MAX_SAMPLES` samples are retained.
#[test]
fn test_sample_limit() {
let mut profile = FieldProfile::new();
for i in 0..20 {
profile.observe(&make_string(&format!("value_{}", i)));
}
assert_eq!(profile.samples.len(), MAX_SAMPLES);
}
// Sampling is over distinct values, so a very frequent value does not crowd out rare ones.
#[test]
fn test_reservoir_sees_rare_values() {
let mut total_rare_in_samples = 0;
let trials = 40;
for _ in 0..trials {
let mut profile = FieldProfile::new();
for _ in 0..1000 {
profile.observe(&make_string("common"));
}
for i in 0..20 {
profile.observe(&make_string(&format!("rare_{i}")));
}
total_rare_in_samples += profile
.samples
.iter()
.filter(|s| s.as_str().is_some_and(|s| s.starts_with("rare_")))
.count();
}
assert!(
total_rare_in_samples > trials * 4,
"reservoir should surface rare distinct values; got {total_rare_in_samples} across {trials} trials",
);
}
// Scalars keep their JSON type; chars become one-character strings.
#[test]
fn test_scalar_to_json_preserves_scalar_types() {
assert_eq!(
scalar_to_json(&make_string("hello")),
serde_json::json!("hello")
);
assert_eq!(scalar_to_json(&make_int(42)), serde_json::json!(42));
assert_eq!(scalar_to_json(&make_float(2.5)), serde_json::json!(2.5));
assert_eq!(scalar_to_json(&make_bool(true)), serde_json::json!(true));
assert_eq!(scalar_to_json(&make_null()), serde_json::Value::Null);
assert_eq!(scalar_to_json(&Dynamic::from('x')), serde_json::json!("x"));
}
// Control characters, quotes and backslashes in strings are stored unchanged.
#[test]
fn test_scalar_to_json_preserves_escaped_strings() {
let value = make_string("line1\nline2\t\"quoted\"\\backslash");
assert_eq!(
scalar_to_json(&value),
serde_json::json!("line1\nline2\t\"quoted\"\\backslash")
);
}
// Rendering examples reads the samples without changing them.
#[test]
fn test_format_examples_renders_typed_samples_without_mutating_them() {
let mut profile = FieldProfile::new();
profile.observe(&make_string("hello"));
profile.observe(&make_int(42));
profile.observe(&make_bool(true));
let before = profile.samples.clone();
let examples = format_examples(&profile);
assert!(
examples.contains("hello"),
"string sample should render: {examples}"
);
assert!(
examples.contains("42"),
"int sample should render: {examples}"
);
assert!(
examples.contains("true"),
"bool sample should render: {examples}"
);
assert_eq!(
profile.samples, before,
"display formatting must not mutate samples"
);
}
// Stored samples keep their full length; only the rendered examples are shortened.
#[test]
fn test_long_string_samples_not_truncated_in_json() {
let long = "x".repeat(MAX_SAMPLE_LEN + 40);
let mut profile = FieldProfile::new();
profile.observe(&make_string(&long));
assert_eq!(profile.samples.len(), 1);
assert_eq!(profile.samples[0], serde_json::json!(long));
let examples = format_examples(&profile);
assert!(
examples.chars().count() <= 60,
"table examples should remain display-truncated: {examples}"
);
}
// Map entries are profiled under dotted paths in addition to the map itself.
#[test]
fn test_nested_flattening() {
let mut discovery = FieldDiscovery::new();
let mut fields = IndexMap::new();
fields.insert(
"user".to_string(),
make_map(vec![("name", make_string("alice")), ("age", make_int(30))]),
);
fields.insert("level".to_string(), make_string("INFO"));
discovery.observe_event(&fields);
assert!(discovery.fields.contains_key("user"));
assert!(discovery.fields.contains_key("user.name"));
assert!(discovery.fields.contains_key("user.age"));
assert_eq!(discovery.fields["user.name"].seen_count, 1);
assert_eq!(discovery.fields["user.age"].type_counts[&FieldType::Int], 1);
}
// Array elements are profiled together under the `[]` path.
#[test]
fn test_array_element_flattening() {
let mut discovery = FieldDiscovery::new();
let mut fields = IndexMap::new();
fields.insert(
"roles".to_string(),
make_array(vec![make_string("admin"), make_string("dev")]),
);
discovery.observe_event(&fields);
assert!(discovery.fields.contains_key("roles"));
assert!(discovery.fields.contains_key("roles[]"));
assert_eq!(discovery.fields["roles[]"].seen_count, 2);
let (card, _) = discovery.fields["roles[]"].cardinality();
assert_eq!(card, 2);
}
// Flattening stops at depth 3 and both report formats say so.
#[test]
fn test_depth_limit() {
let deep = make_map(vec![(
"b",
make_map(vec![(
"c",
make_map(vec![("d", make_map(vec![("e", make_string("bottom"))]))]),
)]),
)]);
let mut fields = IndexMap::new();
fields.insert("a".to_string(), deep);
let mut discovery = FieldDiscovery::new();
discovery.observe_event(&fields);
assert!(discovery.fields.contains_key("a"));
assert!(discovery.fields.contains_key("a.b"));
assert!(discovery.fields.contains_key("a.b.c"));
assert!(!discovery.fields.contains_key("a.b.c.d"));
assert!(!discovery.fields.contains_key("a.b.c.d.e"));
let table = discovery.format_table();
assert!(
table.contains("Nested field flattening stopped at depth 3"),
"table should make depth cap explicit: {table}"
);
let json = discovery.format_json();
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
assert_eq!(parsed["flatten_depth_limit"], 3);
assert_eq!(parsed["flatten_depth_capped"], true);
}
// An element path can be seen more often than there are events; reports must still render.
#[test]
fn test_array_seen_exceeds_events_does_not_panic() {
let mut discovery = FieldDiscovery::new();
let mut fields = IndexMap::new();
fields.insert(
"tags".to_string(),
make_array(vec![make_string("a"), make_string("b"), make_string("c")]),
);
discovery.observe_event(&fields);
assert_eq!(discovery.fields["tags[]"].seen_count, 3);
assert_eq!(discovery.total_events, 1);
let table = discovery.format_table();
assert!(table.contains("tags[]"));
let json = discovery.format_json();
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
assert_eq!(parsed["truncated"], false);
assert_eq!(parsed["flatten_depth_limit"], 3);
assert_eq!(parsed["flatten_depth_capped"], false);
}
}