use super::{ConflictPolicy, InferredKey, InferredSchema};
use crate::error::{NxsError, Result};
/// Sigil byte (`=`) used for keys whose values parse as `i64`.
pub const SIGIL_INT: u8 = b'=';
/// Sigil byte (`~`) used for keys whose values parse as `f64` but not as `i64`.
pub const SIGIL_FLOAT: u8 = b'~';
/// Sigil byte (`?`) used for keys whose values are the literals `true` / `false`.
pub const SIGIL_BOOL: u8 = b'?';
/// Sigil byte (`@`) used for keys whose values pass the `is_time_like` heuristic.
pub const SIGIL_TIME: u8 = b'@';
/// Sigil byte (`<`) used for keys whose values pass the `is_hex_like` heuristic.
pub const SIGIL_HEX: u8 = b'<';
/// Sigil byte (`^`) returned when a key had no non-empty observations.
pub const SIGIL_NULL: u8 = b'^';
/// Sigil byte (`"`) used as the string fallback and as the `CoerceString` conflict result.
pub const SIGIL_STRING: u8 = b'"';
/// Per-key accumulator of everything observed about one key's raw values.
///
/// Updated by [`KeyState::observe`] and read by [`KeyState::resolve_sigil`]
/// and `finalize` to pick the final sigil and optionality of the key.
#[derive(Debug, Default, Clone)]
pub struct KeyState {
/// At least one non-empty value parsed as `i64`.
pub seen_int: bool,
/// At least one non-empty value parsed as `f64` (and not as `i64`).
pub seen_float: bool,
/// At least one value was exactly `true` or `false`.
pub seen_bool: bool,
/// At least one value matched the `is_time_like` heuristic.
pub seen_time: bool,
/// At least one value matched the `is_hex_like` heuristic.
pub seen_binary_hex: bool,
/// At least one non-empty value matched none of the typed categories.
pub seen_string: bool,
/// At least one observed value was the empty string.
pub seen_null: bool,
/// Number of `observe` calls, empty values included.
pub total_records_seen_in: usize,
/// Number of `observe` calls with a non-empty value.
pub present_count: usize,
/// Sigil of the first non-empty value observed; `None` until one is seen.
pub first_sigil: Option<u8>,
}
impl KeyState {
    /// Folds one raw value for this key into the accumulated state.
    ///
    /// Every call bumps `total_records_seen_in`. An empty `raw` only sets
    /// `seen_null`; it does not count as present and never influences
    /// `first_sigil`. A non-empty `raw` bumps `present_count` and is assigned
    /// to the first category it matches, checked in this order: integer
    /// (`i64`), float (`f64`), boolean literal, time-like, hex-like, string.
    pub fn observe(&mut self, raw: &str) {
        self.total_records_seen_in += 1;
        if raw.is_empty() {
            self.seen_null = true;
            return;
        }
        self.present_count += 1;

        // Pick the flag to raise together with the sigil of the matched category.
        // NOTE: `f64`'s `FromStr` also accepts spellings such as "inf" and "nan".
        let (seen_flag, sigil) = if raw.parse::<i64>().is_ok() {
            (&mut self.seen_int, SIGIL_INT)
        } else if raw.parse::<f64>().is_ok() {
            (&mut self.seen_float, SIGIL_FLOAT)
        } else if matches!(raw, "true" | "false") {
            (&mut self.seen_bool, SIGIL_BOOL)
        } else if is_time_like(raw) {
            (&mut self.seen_time, SIGIL_TIME)
        } else if is_hex_like(raw) {
            (&mut self.seen_binary_hex, SIGIL_HEX)
        } else {
            (&mut self.seen_string, SIGIL_STRING)
        };
        *seen_flag = true;

        // Only the very first non-empty observation is remembered.
        if self.first_sigil.is_none() {
            self.first_sigil = Some(sigil);
        }
    }

    /// Chooses the final sigil for this key from the observed categories.
    ///
    /// * No typed observation at all: `SIGIL_NULL`.
    /// * Exactly one category observed: that category's sigil.
    /// * Several categories observed: decided by `policy`.
    ///
    /// # Errors
    ///
    /// Returns `NxsError::ConvertSchemaConflict` when more than one category
    /// was observed and `policy` is `ConflictPolicy::Error`.
    pub fn resolve_sigil(&self, policy: ConflictPolicy) -> Result<u8> {
        let by_category = [
            (self.seen_string, SIGIL_STRING),
            (self.seen_int, SIGIL_INT),
            (self.seen_float, SIGIL_FLOAT),
            (self.seen_bool, SIGIL_BOOL),
            (self.seen_time, SIGIL_TIME),
            (self.seen_binary_hex, SIGIL_HEX),
        ];
        let mut observed = by_category
            .iter()
            .copied()
            .filter(|&(seen, _)| seen)
            .map(|(_, sigil)| sigil);

        // Looking at the first two observed categories is enough to tell
        // "none", "exactly one" and "conflict" apart.
        match (observed.next(), observed.next()) {
            (None, _) => Ok(SIGIL_NULL),
            (Some(only), None) => Ok(only),
            (Some(_), Some(_)) => match policy {
                ConflictPolicy::Error => Err(NxsError::ConvertSchemaConflict(
                    "mixed types observed for key".into(),
                )),
                ConflictPolicy::CoerceString => Ok(SIGIL_STRING),
                ConflictPolicy::FirstWins => Ok(self.first_sigil.unwrap_or(SIGIL_STRING)),
            },
        }
    }
}
/// Heuristically decides whether `s` looks like a date or timestamp.
///
/// A value qualifies when all of the following hold:
/// * it is at least 8 bytes long,
/// * it contains a `-` or `T` separator,
/// * it contains at least one ASCII digit,
/// * every character is an ASCII digit or one of `-`, `:`, `T`, `Z`, `+`, `.`.
///
/// The digit requirement keeps punctuation-only strings such as `"--------"`
/// or `"TTTTTTTT"` from being classified as timestamps.
fn is_time_like(s: &str) -> bool {
    const MIN_LEN: usize = 8;
    if s.len() < MIN_LEN {
        return false;
    }

    let mut has_digit = false;
    let mut has_separator = false;
    // Single pass over the bytes: any byte outside the allowed set (including
    // every byte of a non-ASCII character) rejects the value immediately.
    for byte in s.bytes() {
        match byte {
            b'0'..=b'9' => has_digit = true,
            b'-' | b'T' => has_separator = true,
            b':' | b'Z' | b'+' | b'.' => {}
            _ => return false,
        }
    }
    has_digit && has_separator
}
/// Heuristically decides whether `s` looks like hex-encoded binary data.
///
/// The value must be at least 16 bytes long, have an even length, and
/// consist solely of ASCII hexadecimal digits (either case).
fn is_hex_like(s: &str) -> bool {
    let bytes = s.as_bytes();
    if bytes.len() < 16 || bytes.len() % 2 != 0 {
        return false;
    }
    bytes.iter().all(u8::is_ascii_hexdigit)
}
/// Folds one record (a list of `(key, raw value)` pairs) into `acc`.
///
/// A key already present in `acc.keys` has its matching entry in
/// `acc.key_states` updated with the new value. An unseen key is appended to
/// both `acc.keys` (with a placeholder sigil of `0`, non-optional, no list
/// type) and `acc.key_states`. After all pairs are processed,
/// `acc.total_records` is incremented by one.
pub fn merge(acc: &mut InferredSchema, record: &[(String, String)]) {
    for (name, raw) in record {
        match acc.keys.iter().position(|known| known.name == *name) {
            Some(idx) => {
                // `keys` and `key_states` are parallel vectors; a missing
                // state at `idx` is tolerated and the value is skipped.
                if let Some(state) = acc.key_states.get_mut(idx) {
                    state.observe(raw);
                }
            }
            None => {
                let mut state = KeyState::default();
                state.observe(raw);
                acc.key_states.push(state);
                acc.keys.push(InferredKey {
                    name: name.clone(),
                    sigil: 0,
                    optional: false,
                    list_of: None,
                });
            }
        }
    }
    acc.total_records += 1;
}
/// Turns the accumulated observations into final per-key decisions.
///
/// Each key paired with a state gets its sigil from
/// [`KeyState::resolve_sigil`] under `policy`, and is marked optional when it
/// had a non-empty value in fewer records than `acc.total_records`.
///
/// # Errors
///
/// Propagates the first error returned by `resolve_sigil`.
pub fn finalize(mut acc: InferredSchema, policy: ConflictPolicy) -> Result<InferredSchema> {
    let record_total = acc.total_records;
    for (slot, observed) in acc.keys.iter_mut().zip(&acc.key_states) {
        slot.optional = observed.present_count < record_total;
        slot.sigil = observed.resolve_sigil(policy)?;
    }
    Ok(acc)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::convert::ConflictPolicy;

    /// Builds a `KeyState` by observing every value in order.
    fn observe_all(values: &[&str]) -> KeyState {
        let mut state = KeyState::default();
        values.iter().for_each(|raw| state.observe(raw));
        state
    }

    #[test]
    fn test_infer_priority_order() {
        // Homogeneous integers.
        let ks = observe_all(&["1", "2", "3"]);
        assert_eq!(ks.resolve_sigil(ConflictPolicy::Error).unwrap(), SIGIL_INT);

        // Int + float is a conflict; coercion falls back to string.
        let ks = observe_all(&["1", "2.5"]);
        assert_eq!(
            ks.resolve_sigil(ConflictPolicy::CoerceString).unwrap(),
            SIGIL_STRING
        );

        // Homogeneous booleans.
        let ks = observe_all(&["true", "false", "true"]);
        assert_eq!(ks.resolve_sigil(ConflictPolicy::Error).unwrap(), SIGIL_BOOL);

        // Bool + int is a conflict under the Error policy.
        let ks = observe_all(&["true", "0"]);
        assert!(ks.resolve_sigil(ConflictPolicy::Error).is_err());

        // Homogeneous dates.
        let ks = observe_all(&["2026-04-30", "2025-01-01"]);
        assert_eq!(ks.resolve_sigil(ConflictPolicy::Error).unwrap(), SIGIL_TIME);

        // Homogeneous hex blobs.
        let ks = observe_all(&["deadbeefcafe0001", "0123456789abcdef"]);
        assert_eq!(ks.resolve_sigil(ConflictPolicy::Error).unwrap(), SIGIL_HEX);

        // Int + free text coerces to string.
        let ks = observe_all(&["1", "hello"]);
        assert_eq!(
            ks.resolve_sigil(ConflictPolicy::CoerceString).unwrap(),
            SIGIL_STRING
        );

        // Only empty values resolves to null.
        let ks = observe_all(&["", ""]);
        assert_eq!(ks.resolve_sigil(ConflictPolicy::Error).unwrap(), SIGIL_NULL);
    }

    #[test]
    fn test_infer_missing_keys_marked_optional() {
        let mut acc = InferredSchema::default();
        merge(&mut acc, &[("email".into(), "a@b.com".into())]);
        // A second record that does not carry the key at all.
        merge(&mut acc, &[]);

        let schema = finalize(acc, ConflictPolicy::Error).unwrap();
        let email = schema.keys.iter().find(|k| k.name == "email").unwrap();
        assert!(email.optional, "key absent in one record must be optional");
    }

    #[test]
    fn test_infer_on_conflict_coerce_string() {
        let ks = observe_all(&["1", "hello"]);
        let sigil = ks.resolve_sigil(ConflictPolicy::CoerceString).unwrap();
        assert_eq!(sigil, SIGIL_STRING);
    }

    #[test]
    fn test_infer_on_conflict_error() {
        let ks = observe_all(&["1", "hello"]);
        let result = ks.resolve_sigil(ConflictPolicy::Error);
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            NxsError::ConvertSchemaConflict(_)
        ));
    }

    #[test]
    fn test_infer_first_wins_returns_first_observed_sigil() {
        let int_first = observe_all(&["1", "hello"]);
        assert_eq!(
            int_first.resolve_sigil(ConflictPolicy::FirstWins).unwrap(),
            SIGIL_INT,
            "FirstWins: first-seen type (int) must win"
        );

        let string_first = observe_all(&["hello", "1"]);
        assert_eq!(
            string_first
                .resolve_sigil(ConflictPolicy::FirstWins)
                .unwrap(),
            SIGIL_STRING,
            "FirstWins: first-seen type (string) must win"
        );

        let null_first = observe_all(&["", "42", "abc"]);
        assert_eq!(
            null_first.resolve_sigil(ConflictPolicy::FirstWins).unwrap(),
            SIGIL_INT,
            "FirstWins: null observations must not pollute first_sigil"
        );
    }
}