use std::collections::{HashMap, HashSet};
use serde::Deserialize;
use crate::dataset::ImageId;
use crate::error::EvalError;
use crate::partition::{KeyKind, PartitionSpec, CROSS_SEPARATOR};
/// The only `manifest_version` string that `parse_manifest` accepts; any
/// other value is rejected as an invalid configuration.
pub const MANIFEST_VERSION: &str = "1";
/// Top-level shape of the manifest JSON document, deserialized with serde.
#[derive(Debug, Deserialize)]
struct ManifestDoc {
/// Format version; `parse_manifest` rejects anything other than `MANIFEST_VERSION`.
manifest_version: String,
/// How row keys are interpreted; `parse_manifest` accepts `"image_id"` or `"result"`.
key_kind: String,
/// The manifest rows, as listed in the document's `rows` array.
rows: Vec<ManifestRow>,
}
/// One entry of the manifest's `rows` array.
#[derive(Debug, Deserialize)]
struct ManifestRow {
/// Row key kept as raw JSON; `parse_manifest` later reads it as an integer
/// image id or as a string label, depending on the manifest's `key_kind`.
key: serde_json::Value,
/// Every field of the row object other than `key`, captured by
/// `#[serde(flatten)]`. Each entry is treated as an axis column
/// (axis name -> raw JSON value).
#[serde(flatten)]
axes: serde_json::Map<String, serde_json::Value>,
}
/// Non-fatal findings collected by `parse_manifest` while reading rows.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ManifestWarning {
/// A row's key was not in the set of known image ids (image-keyed
/// manifests) or known labels (result-keyed manifests); the row was skipped.
UnknownKey {
/// The offending key: the image id rendered as a decimal string, or the
/// label itself.
key: String,
},
}
/// Validated contents of a manifest, as produced by `parse_manifest`.
#[derive(Debug, Clone)]
pub struct ParsedManifest {
/// Key interpretation selected by the manifest's `key_kind` field.
pub key_kind: KeyKind,
/// Axis name -> axis value -> ids of the images carrying that value.
/// Only filled for image-keyed manifests; empty otherwise.
pub per_axis_image: HashMap<String, HashMap<String, HashSet<ImageId>>>,
/// Result label -> axis name -> axis value.
/// Only filled for result-keyed manifests; empty otherwise.
pub per_label: HashMap<String, HashMap<String, String>>,
/// One warning per row that was skipped because its key was unknown.
pub warnings: Vec<ManifestWarning>,
}
pub fn parse_manifest(
bytes: &[u8],
known_image_ids: &HashSet<ImageId>,
known_labels: &HashSet<String>,
) -> Result<ParsedManifest, EvalError> {
let doc: ManifestDoc = serde_json::from_slice(bytes)?;
if doc.manifest_version != MANIFEST_VERSION {
return Err(EvalError::InvalidConfig {
detail: format!(
"unsupported manifest_version {:?}; expected {:?}",
doc.manifest_version, MANIFEST_VERSION
),
});
}
let key_kind = match doc.key_kind.as_str() {
"image_id" => KeyKind::Image,
"result" => KeyKind::Result,
other => {
return Err(EvalError::InvalidConfig {
detail: format!("unknown key_kind {other:?}; expected \"image_id\" or \"result\""),
});
}
};
let mut axis_names: Option<Vec<String>> = None;
let mut per_axis_image: HashMap<String, HashMap<String, HashSet<ImageId>>> = HashMap::new();
let mut per_label: HashMap<String, HashMap<String, String>> = HashMap::new();
let mut warnings: Vec<ManifestWarning> = Vec::new();
for (row_idx, row) in doc.rows.iter().enumerate() {
let row_axes = collect_row_axes(row).map_err(|detail| EvalError::InvalidConfig {
detail: format!("row {row_idx}: {detail}"),
})?;
match &axis_names {
None => {
for name in &row_axes {
if name.contains(CROSS_SEPARATOR) {
return Err(EvalError::InvalidConfig {
detail: format!(
"manifest axis {name:?} contains the reserved separator \
{CROSS_SEPARATOR:?}; rename the column"
),
});
}
}
axis_names = Some(row_axes.clone());
}
Some(prev) => {
if prev != &row_axes {
return Err(EvalError::InvalidConfig {
detail: format!(
"row {row_idx} axes {row_axes:?} differ from first row {prev:?}; \
vernier rejects ragged manifests"
),
});
}
}
}
match key_kind {
KeyKind::Image => {
let id =
parse_image_id_key(&row.key).map_err(|detail| EvalError::InvalidConfig {
detail: format!("row {row_idx}: {detail}"),
})?;
if !known_image_ids.contains(&id) {
warnings.push(ManifestWarning::UnknownKey {
key: id.0.to_string(),
});
continue;
}
for axis in &row_axes {
let value = parse_axis_value(&row.axes[axis]).map_err(|detail| {
EvalError::InvalidConfig {
detail: format!("row {row_idx} axis {axis:?}: {detail}"),
}
})?;
per_axis_image
.entry(axis.clone())
.or_default()
.entry(value)
.or_default()
.insert(id);
}
}
KeyKind::Result => {
let label = parse_result_label_key(&row.key).map_err(|detail| {
EvalError::InvalidConfig {
detail: format!("row {row_idx}: {detail}"),
}
})?;
if !known_labels.contains(&label) {
warnings.push(ManifestWarning::UnknownKey { key: label });
continue;
}
let mut row_axis_values: HashMap<String, String> = HashMap::new();
for axis in &row_axes {
let value = parse_axis_value(&row.axes[axis]).map_err(|detail| {
EvalError::InvalidConfig {
detail: format!("row {row_idx} axis {axis:?}: {detail}"),
}
})?;
row_axis_values.insert(axis.clone(), value);
}
per_label.insert(label, row_axis_values);
}
}
}
Ok(ParsedManifest {
key_kind,
per_axis_image,
per_label,
warnings,
})
}
pub fn partition_spec_from_manifest(
bytes: &[u8],
image_id_to_idx: &HashMap<ImageId, usize>,
cross_axes: &[Vec<String>],
) -> Result<(PartitionSpec, Vec<ManifestWarning>), EvalError> {
let known_image_ids: HashSet<ImageId> = image_id_to_idx.keys().copied().collect();
let parsed = parse_manifest(bytes, &known_image_ids, &HashSet::new())?;
if !matches!(parsed.key_kind, KeyKind::Image) {
return Err(EvalError::InvalidConfig {
detail: "evaluate_partitioned consumes key_kind=\"image_id\" manifests; \
a key_kind=\"result\" manifest must be routed through \
vernier.aggregate / `vernier aggregate`"
.into(),
});
}
let spec = PartitionSpec::build(
parsed.key_kind,
&parsed.per_axis_image,
&known_image_ids,
image_id_to_idx,
cross_axes,
)?;
Ok((spec, parsed.warnings))
}
fn collect_row_axes(row: &ManifestRow) -> Result<Vec<String>, String> {
if row.key == serde_json::Value::Null {
return Err("missing `key` column".into());
}
let mut axes: Vec<String> = row.axes.keys().cloned().collect();
axes.sort();
if axes.is_empty() {
return Err("row has no axis columns beyond `key`".into());
}
Ok(axes)
}
fn parse_image_id_key(value: &serde_json::Value) -> Result<ImageId, String> {
match value {
serde_json::Value::Number(n) => {
if let Some(i) = n.as_i64() {
Ok(ImageId(i))
} else {
Err(format!("image_id key must be an integer; got {n:?}"))
}
}
serde_json::Value::String(s) => s
.parse::<i64>()
.map(ImageId)
.map_err(|_| format!("image_id key {s:?} is not an integer")),
other => Err(format!("image_id key must be an integer; got {other:?}")),
}
}
fn parse_result_label_key(value: &serde_json::Value) -> Result<String, String> {
match value {
serde_json::Value::String(s) => Ok(s.clone()),
other => Err(format!(
"result-keyed manifest needs a string key; got {other:?}"
)),
}
}
fn parse_axis_value(value: &serde_json::Value) -> Result<String, String> {
match value {
serde_json::Value::String(s) => Ok(s.clone()),
serde_json::Value::Number(_) => {
Err("axis values must be strings; numeric slicing is the Breakdown axis".into())
}
serde_json::Value::Bool(_) => Err("axis values must be strings; got a JSON boolean".into()),
other => Err(format!("axis values must be strings; got {other:?}")),
}
}
// Unit tests for manifest parsing and for the partition-spec helper.
#[cfg(test)]
mod tests {
use super::*;
// Builds the set of known image ids `ImageId(1)..=ImageId(n)`.
fn known_ids(n: i64) -> HashSet<ImageId> {
(1..=n).map(ImageId).collect()
}
// Happy path: two known images, one axis; no warnings and one image per
// axis value.
#[test]
fn parses_minimum_image_manifest() {
let bytes = br#"{
"manifest_version": "1",
"key_kind": "image_id",
"rows": [
{"key": 1, "weather": "fog"},
{"key": 2, "weather": "clear"}
]
}"#;
let parsed = parse_manifest(bytes, &known_ids(2), &HashSet::new()).unwrap();
assert_eq!(parsed.key_kind, KeyKind::Image);
assert!(parsed.warnings.is_empty());
let weather = parsed.per_axis_image.get("weather").unwrap();
assert_eq!(weather["fog"].len(), 1);
assert_eq!(weather["clear"].len(), 1);
}
// Image id 99 is not among the known ids {1, 2}: the row must be skipped
// with a single `UnknownKey` warning and must not reach the "fog" bucket.
#[test]
fn unknown_image_id_emits_warning_and_is_skipped() {
let bytes = br#"{
"manifest_version": "1",
"key_kind": "image_id",
"rows": [
{"key": 1, "weather": "fog"},
{"key": 99, "weather": "fog"}
]
}"#;
let parsed = parse_manifest(bytes, &known_ids(2), &HashSet::new()).unwrap();
assert_eq!(parsed.warnings.len(), 1);
assert!(matches!(
parsed.warnings[0],
ManifestWarning::UnknownKey { ref key } if key == "99"
));
let weather = parsed.per_axis_image.get("weather").unwrap();
assert_eq!(weather["fog"].len(), 1);
}
// The second row lacks the "time" axis present in the first row, so the
// manifest is ragged and must be rejected.
#[test]
fn ragged_axes_are_rejected() {
let bytes = br#"{
"manifest_version": "1",
"key_kind": "image_id",
"rows": [
{"key": 1, "weather": "fog", "time": "day"},
{"key": 2, "weather": "clear"}
]
}"#;
let err = parse_manifest(bytes, &known_ids(2), &HashSet::new()).unwrap_err();
assert!(matches!(err, EvalError::InvalidConfig { .. }));
}
// A `manifest_version` other than `MANIFEST_VERSION` is an error even when
// there are no rows.
#[test]
fn version_mismatch_is_rejected() {
let bytes = br#"{
"manifest_version": "2",
"key_kind": "image_id",
"rows": []
}"#;
let err = parse_manifest(bytes, &known_ids(0), &HashSet::new()).unwrap_err();
assert!(matches!(err, EvalError::InvalidConfig { .. }));
}
// Only "image_id" and "result" are valid `key_kind` values.
#[test]
fn unknown_key_kind_is_rejected() {
let bytes = br#"{
"manifest_version": "1",
"key_kind": "frame_id",
"rows": []
}"#;
let err = parse_manifest(bytes, &known_ids(0), &HashSet::new()).unwrap_err();
assert!(matches!(err, EvalError::InvalidConfig { .. }));
}
// A numeric axis value on a row with a known key must be rejected.
#[test]
fn axis_value_must_be_string() {
let bytes = br#"{
"manifest_version": "1",
"key_kind": "image_id",
"rows": [{"key": 1, "weather": 5}]
}"#;
let err = parse_manifest(bytes, &known_ids(1), &HashSet::new()).unwrap_err();
assert!(matches!(err, EvalError::InvalidConfig { .. }));
}
// Result-keyed manifests fill `per_label` (label -> axis -> value).
#[test]
fn result_keyed_manifest_collects_per_label_axis_values() {
let bytes = br#"{
"manifest_version": "1",
"key_kind": "result",
"rows": [
{"key": "clean", "weather": "clear"},
{"key": "fog_run", "weather": "fog"}
]
}"#;
let labels: HashSet<String> = ["clean", "fog_run"].iter().map(|s| s.to_string()).collect();
let parsed = parse_manifest(bytes, &HashSet::new(), &labels).unwrap();
assert_eq!(parsed.key_kind, KeyKind::Result);
assert_eq!(parsed.per_label["clean"]["weather"], "clear");
assert_eq!(parsed.per_label["fog_run"]["weather"], "fog");
}
// `partition_spec_from_manifest` only accepts image-keyed manifests.
#[test]
fn partition_spec_helper_rejects_result_keyed_manifest() {
let bytes = br#"{
"manifest_version": "1",
"key_kind": "result",
"rows": []
}"#;
let image_id_to_idx: HashMap<ImageId, usize> = HashMap::new();
let err = partition_spec_from_manifest(bytes, &image_id_to_idx, &[]).unwrap_err();
assert!(matches!(err, EvalError::InvalidConfig { .. }));
}
// Images 2 and 3 exist in the dataset but are not mentioned by the
// manifest; the built spec must place them in the `UNASSIGNED` slice of the
// "weather" axis.
#[test]
fn partition_spec_helper_emits_unassigned_for_unmentioned_images() {
let bytes = br#"{
"manifest_version": "1",
"key_kind": "image_id",
"rows": [
{"key": 1, "weather": "fog"}
]
}"#;
let image_id_to_idx: HashMap<ImageId, usize> =
(1..=3).map(|i| (ImageId(i), (i - 1) as usize)).collect();
let (spec, _warnings) = partition_spec_from_manifest(bytes, &image_id_to_idx, &[]).unwrap();
let unassigned = spec
.slices
.iter()
.find(|s| s.axis == "weather" && s.value == crate::partition::UNASSIGNED)
.expect("unassigned bucket missing");
let mut ids: Vec<i64> = unassigned.image_ids.iter().map(|i| i.0).collect();
// Sort so the comparison does not depend on the order `image_ids` yields.
ids.sort();
assert_eq!(ids, vec![2, 3]);
}
}