use std::collections::BTreeMap;
use std::path::Path;
use zenith_core::DataContext;
#[derive(Debug)]
pub struct DataInputError {
pub message: String,
}
impl DataInputError {
fn new(msg: impl Into<String>) -> Self {
Self {
message: msg.into(),
}
}
}
impl std::fmt::Display for DataInputError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.message)
}
}
pub fn load_data_context(path: &Path) -> Result<DataContext, DataInputError> {
let ext = path
.extension()
.and_then(|e| e.to_str())
.unwrap_or("")
.to_ascii_lowercase();
match ext.as_str() {
"json" => load_from_json(path),
"csv" => load_from_csv(path),
other => Err(DataInputError::new(format!(
"--data: unsupported file extension '.{other}'; expected .json or .csv"
))),
}
}
fn load_from_json(path: &Path) -> Result<DataContext, DataInputError> {
let bytes = std::fs::read(path).map_err(|e| {
DataInputError::new(format!("--data: cannot read '{}': {}", path.display(), e))
})?;
let text = std::str::from_utf8(&bytes).map_err(|e| {
DataInputError::new(format!(
"--data: '{}' is not valid UTF-8: {}",
path.display(),
e
))
})?;
let value: serde_json::Value = serde_json::from_str(text).map_err(|e| {
DataInputError::new(format!(
"--data: '{}' is not valid JSON: {}",
path.display(),
e
))
})?;
let obj = match value {
serde_json::Value::Object(map) => map,
serde_json::Value::Array(arr) => {
let first = arr.into_iter().next().ok_or_else(|| {
DataInputError::new(format!(
"--data: '{}' is an empty JSON array; expected a non-empty array or object",
path.display()
))
})?;
match first {
serde_json::Value::Object(map) => map,
other => {
return Err(DataInputError::new(format!(
"--data: first element of '{}' is {} not an object",
path.display(),
json_kind_name(&other)
)));
}
}
}
other => {
return Err(DataInputError::new(format!(
"--data: '{}' contains {} not a JSON object or array",
path.display(),
json_kind_name(&other)
)));
}
};
let mut fields: BTreeMap<String, String> = BTreeMap::new();
let mut arrays: BTreeMap<String, Vec<String>> = BTreeMap::new();
flatten_object(&obj, String::new(), &mut fields, &mut arrays);
Ok(DataContext { fields, arrays })
}
fn flatten_object(
obj: &serde_json::Map<String, serde_json::Value>,
prefix: String,
out_fields: &mut BTreeMap<String, String>,
out_arrays: &mut BTreeMap<String, Vec<String>>,
) {
for (key, val) in obj {
let path = if prefix.is_empty() {
key.clone()
} else {
format!("{prefix}.{key}")
};
match val {
serde_json::Value::Object(inner) => {
flatten_object(inner, path, out_fields, out_arrays);
}
serde_json::Value::Array(arr) => {
let strings: Vec<String> = arr
.iter()
.filter_map(|e| match e {
serde_json::Value::Number(n) => Some(n.to_string()),
serde_json::Value::String(s) => Some(s.clone()),
serde_json::Value::Bool(b) => Some(b.to_string()),
serde_json::Value::Null => Some(String::new()),
_ => None,
})
.collect();
if !strings.is_empty() {
out_arrays.insert(path, strings);
}
}
serde_json::Value::String(s) => {
out_fields.insert(path, s.clone());
}
serde_json::Value::Number(n) => {
out_fields.insert(path, n.to_string());
}
serde_json::Value::Bool(b) => {
out_fields.insert(path, b.to_string());
}
serde_json::Value::Null => {
out_fields.insert(path, String::new());
}
}
}
}
fn json_kind_name(v: &serde_json::Value) -> &'static str {
match v {
serde_json::Value::Null => "null",
serde_json::Value::Bool(_) => "a boolean",
serde_json::Value::Number(_) => "a number",
serde_json::Value::String(_) => "a string",
serde_json::Value::Array(_) => "an array",
serde_json::Value::Object(_) => "an object",
}
}
fn load_from_csv(path: &Path) -> Result<DataContext, DataInputError> {
let bytes = std::fs::read(path).map_err(|e| {
DataInputError::new(format!("--data: cannot read '{}': {}", path.display(), e))
})?;
let text = std::str::from_utf8(&bytes).map_err(|e| {
DataInputError::new(format!(
"--data: '{}' is not valid UTF-8: {}",
path.display(),
e
))
})?;
let mut reader = csv::ReaderBuilder::new()
.flexible(true)
.from_reader(text.as_bytes());
let headers = reader
.headers()
.map_err(|e| {
DataInputError::new(format!(
"--data: CSV header error in '{}': {}",
path.display(),
e
))
})?
.clone();
let mut all_records: Vec<csv::StringRecord> = Vec::new();
for result in reader.records() {
let record = result.map_err(|e| {
DataInputError::new(format!(
"--data: CSV parse error in '{}': {}",
path.display(),
e
))
})?;
all_records.push(record);
}
if all_records.is_empty() {
return Err(DataInputError::new(format!(
"--data: '{}' has a header but no data rows",
path.display()
)));
}
let fields: BTreeMap<String, String> = all_records
.first()
.map(|first_record| {
headers
.iter()
.zip(first_record.iter())
.map(|(h, v)| (h.to_owned(), v.to_owned()))
.collect()
})
.unwrap_or_default();
let mut arrays: BTreeMap<String, Vec<String>> = BTreeMap::new();
for (col_idx, header) in headers.iter().enumerate() {
let column: Vec<String> = all_records
.iter()
.map(|rec| rec.get(col_idx).unwrap_or("").to_owned())
.collect();
arrays.insert(header.to_owned(), column);
}
Ok(DataContext { fields, arrays })
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
fn as_strs(arr: Option<&[String]>) -> Option<Vec<&str>> {
arr.map(|a| a.iter().map(String::as_str).collect())
}
fn write_temp(suffix: &str, content: &[u8]) -> (tempfile::TempDir, std::path::PathBuf) {
let dir = tempfile::TempDir::new().unwrap();
let path = dir.path().join(format!("data{suffix}"));
std::fs::File::create(&path)
.unwrap()
.write_all(content)
.unwrap();
(dir, path)
}
#[test]
fn json_flat_object_fields() {
let (_dir, path) = write_temp(".json", br#"{"name": "Alice", "age": 30, "active": true}"#);
let ctx = load_data_context(&path).unwrap();
assert_eq!(ctx.get("name"), Some("Alice"));
assert_eq!(ctx.get("age"), Some("30"));
assert_eq!(ctx.get("active"), Some("true"));
}
#[test]
fn json_null_becomes_empty_string() {
let (_dir, path) = write_temp(".json", br#"{"x": null}"#);
let ctx = load_data_context(&path).unwrap();
assert_eq!(ctx.get("x"), Some(""));
}
#[test]
fn json_nested_object_flattens() {
let (_dir, path) = write_temp(
".json",
br#"{"revenue": {"total": 42, "tax": 3.5}, "label": "Q1"}"#,
);
let ctx = load_data_context(&path).unwrap();
assert_eq!(ctx.get("revenue.total"), Some("42"));
assert_eq!(ctx.get("revenue.tax"), Some("3.5"));
assert_eq!(ctx.get("label"), Some("Q1"));
assert_eq!(ctx.get("revenue"), None);
}
#[test]
fn json_nested_array_is_skipped() {
let (_dir, path) = write_temp(".json", br#"{"tags": [1, 2, 3], "val": "ok"}"#);
let ctx = load_data_context(&path).unwrap();
assert_eq!(ctx.get("val"), Some("ok"));
assert_eq!(ctx.get("tags"), None);
}
#[test]
fn json_array_first_element_used() {
let (_dir, path) = write_temp(
".json",
br##"[{"color": "#ff0000"}, {"color": "#00ff00"}]"##,
);
let ctx = load_data_context(&path).unwrap();
assert_eq!(ctx.get("color"), Some("#ff0000"));
}
#[test]
fn json_empty_array_is_error() {
let (_dir, path) = write_temp(".json", b"[]");
let err = load_data_context(&path).unwrap_err();
assert!(
err.message.contains("empty JSON array"),
"expected 'empty JSON array' in error; got: {}",
err.message
);
}
#[test]
fn json_array_non_object_first_element_is_error() {
let (_dir, path) = write_temp(".json", b"[42]");
let err = load_data_context(&path).unwrap_err();
assert!(
err.message.contains("not an object"),
"expected 'not an object' in error; got: {}",
err.message
);
}
#[test]
fn json_top_level_scalar_is_error() {
let (_dir, path) = write_temp(".json", b"\"hello\"");
let err = load_data_context(&path).unwrap_err();
assert!(
err.message.contains("not a JSON object or array"),
"expected 'not a JSON object or array' in error; got: {}",
err.message
);
}
#[test]
fn csv_header_and_first_row() {
let (_dir, path) = write_temp(".csv", b"name,city\nAlice,Wonderland\nBob,Nowhere");
let ctx = load_data_context(&path).unwrap();
assert_eq!(ctx.get("name"), Some("Alice"));
assert_eq!(ctx.get("city"), Some("Wonderland"));
}
#[test]
fn csv_no_data_rows_is_error() {
let (_dir, path) = write_temp(".csv", b"name,city\n");
let err = load_data_context(&path).unwrap_err();
assert!(
err.message.contains("no data rows"),
"expected 'no data rows' in error; got: {}",
err.message
);
}
#[test]
fn unknown_extension_is_error() {
let (_dir, path) = write_temp(".toml", b"key = \"val\"");
let err = load_data_context(&path).unwrap_err();
assert!(
err.message.contains("unsupported file extension"),
"expected 'unsupported file extension' in error; got: {}",
err.message
);
}
#[test]
fn json_fields_are_sorted() {
let (_dir, path) = write_temp(".json", br#"{"z": "last", "a": "first", "m": "middle"}"#);
let ctx = load_data_context(&path).unwrap();
let keys: Vec<&str> = ctx.fields.keys().map(String::as_str).collect();
assert_eq!(keys, vec!["a", "m", "z"]);
}
#[test]
fn json_array_value_populates_arrays() {
let (_dir, path) = write_temp(".json", br#"{"sales": [12, 18, 15]}"#);
let ctx = load_data_context(&path).unwrap();
assert_eq!(
as_strs(ctx.get_array("sales")),
Some(vec!["12", "18", "15"]),
"numeric JSON array must populate arrays map"
);
assert_eq!(ctx.get("sales"), None);
}
#[test]
fn json_array_with_mixed_scalars() {
let (_dir, path) = write_temp(".json", br#"{"vals": [1, "two", true, null]}"#);
let ctx = load_data_context(&path).unwrap();
assert_eq!(
as_strs(ctx.get_array("vals")),
Some(vec!["1", "two", "true", ""]),
);
}
#[test]
fn json_empty_array_is_not_inserted() {
let (_dir, path) = write_temp(".json", br#"{"empty": [], "x": "y"}"#);
let ctx = load_data_context(&path).unwrap();
assert!(
ctx.get_array("empty").is_none(),
"empty array must not be inserted"
);
assert_eq!(ctx.get("x"), Some("y"));
}
#[test]
fn json_scalar_and_array_coexist() {
let (_dir, path) = write_temp(".json", br#"{"name": "Alice", "scores": [10, 20, 30]}"#);
let ctx = load_data_context(&path).unwrap();
assert_eq!(ctx.get("name"), Some("Alice"));
assert_eq!(
as_strs(ctx.get_array("scores")),
Some(vec!["10", "20", "30"])
);
}
#[test]
fn csv_all_rows_populate_arrays() {
let (_dir, path) = write_temp(".csv", b"month,revenue\nJan,100\nFeb,200\nMar,150");
let ctx = load_data_context(&path).unwrap();
assert_eq!(ctx.get("month"), Some("Jan"));
assert_eq!(ctx.get("revenue"), Some("100"));
assert_eq!(
as_strs(ctx.get_array("month")),
Some(vec!["Jan", "Feb", "Mar"]),
);
assert_eq!(
as_strs(ctx.get_array("revenue")),
Some(vec!["100", "200", "150"]),
);
}
#[test]
fn csv_short_row_pads_with_empty_string() {
let (_dir, path) = write_temp(".csv", b"month,revenue\nJan,100\nFeb");
let ctx = load_data_context(&path).unwrap();
assert_eq!(
as_strs(ctx.get_array("revenue")),
Some(vec!["100", ""]),
"short CSV row must pad missing cells with empty string"
);
}
}