use crate::error::{TrazaeoError, TrazaeoResult};
use crate::utils::Hash;
use serde::{Deserialize, Serialize};
// NOTE(review): the meaning of each individual mode is not defined in this
// file; document the variants once their semantics are confirmed.
/// Enumerates the provenance start modes known to this module.
///
/// The type is `Copy` and is serialized/deserialized by serde using
/// snake_case variant names (for example `SourceCapture` is written as
/// `"source_capture"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ProvenanceStartMode {
SourceCapture,
TransportCapture,
DatasetBootstrap,
DatasetIncremental,
}
impl ProvenanceStartMode {
    /// Every variant, in declaration order; scanned by [`Self::parse`].
    const ALL: [Self; 4] = [
        Self::SourceCapture,
        Self::TransportCapture,
        Self::DatasetBootstrap,
        Self::DatasetIncremental,
    ];

    /// Returns the snake_case name of this mode.
    ///
    /// The names are the same strings serde produces for the enum through its
    /// `rename_all = "snake_case"` attribute.
    pub fn as_str(self) -> &'static str {
        // Kept as an exhaustive match so that adding a variant is a compile
        // error until it is given a name here.
        match self {
            Self::DatasetIncremental => "dataset_incremental",
            Self::DatasetBootstrap => "dataset_bootstrap",
            Self::TransportCapture => "transport_capture",
            Self::SourceCapture => "source_capture",
        }
    }

    /// Parses a mode from the exact name returned by [`Self::as_str`].
    ///
    /// Matching is exact (case-sensitive, no trimming); any other input
    /// yields `None`.
    pub fn parse(value: &str) -> Option<Self> {
        Self::ALL
            .iter()
            .copied()
            .find(|mode| mode.as_str() == value)
    }
}
/// One source file recorded in a [`SourceManifest`].
///
/// All four fields take part in the canonical ordering and encoding produced
/// by [`canonical_source_manifest_bytes`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SourceFileEntry {
/// Location of the source file (the tests in this file use `s3://` URIs).
pub source_uri: String,
/// Hash of the file content, stored as a string.
// NOTE(review): the hash algorithm and encoding are not fixed by this file — confirm with the producer.
pub content_hash: String,
/// Length of the file; presumably in bytes, going by the field name.
pub byte_length: u64,
/// Optional modification time observed for the file. The tests in this file
/// use RFC 3339-style strings, but no format is enforced here.
pub observed_mtime: Option<String>,
}
/// A manifest listing the source files of a dataset together with a root hash
/// over them.
///
/// Use [`validate_source_manifest`] to check that `source_file_count` and
/// `source_root_hash` are consistent with `source_files`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SourceManifest {
/// Identifier of this manifest.
pub manifest_id: String,
/// Creation time of the manifest, stored as a string (format not enforced here).
pub manifest_created_at: String,
/// Identifier of the source dataset this manifest refers to.
pub source_dataset_id: String,
/// The source file entries covered by the manifest.
pub source_files: Vec<SourceFileEntry>,
/// Number of entries; validation requires this to equal `source_files.len()`.
pub source_file_count: usize,
/// Hex encoding of [`compute_source_root_hash`] over `source_files`;
/// validation compares it as an exact string.
pub source_root_hash: String,
}
pub fn canonical_source_manifest_bytes(source_files: &[SourceFileEntry]) -> Vec<u8> {
let mut normalized = source_files.to_vec();
normalized.sort_by(|a, b| {
a.source_uri
.cmp(&b.source_uri)
.then(a.content_hash.cmp(&b.content_hash))
.then(a.byte_length.cmp(&b.byte_length))
.then(a.observed_mtime.cmp(&b.observed_mtime))
});
let rows: Vec<String> = normalized
.iter()
.map(|entry| {
format!(
"{}|{}|{}|{}",
entry.source_uri,
entry.content_hash,
entry.byte_length,
entry.observed_mtime.clone().unwrap_or_default()
)
})
.collect();
rows.join("\n").into_bytes()
}
/// Computes the root hash of a set of source file entries.
///
/// The hash is the BLAKE3 digest of [`canonical_source_manifest_bytes`] for
/// the same entries, so it is independent of the order of `source_files`.
pub fn compute_source_root_hash(source_files: &[SourceFileEntry]) -> Hash {
    let digest = blake3::hash(canonical_source_manifest_bytes(source_files).as_slice());
    Hash(*digest.as_bytes())
}
/// Checks that a manifest's derived fields agree with its file list.
///
/// Two conditions are verified, in this order:
/// 1. `source_file_count` equals `source_files.len()`;
/// 2. `source_root_hash` equals the hex encoding of
///    [`compute_source_root_hash`] over `source_files` (exact string match).
///
/// The root hash is only computed when the count check passes.
///
/// # Errors
///
/// Returns `TrazaeoError::invalid_input` for the operation
/// `"validate source manifest"` describing the first condition that fails.
pub fn validate_source_manifest(manifest: &SourceManifest) -> TrazaeoResult<()> {
    const OPERATION: &str = "validate source manifest";
    let files = &manifest.source_files;

    // Pick the first failing check; the `else if` keeps the hash computation
    // lazy, exactly as an early return would.
    let failure = if files.len() != manifest.source_file_count {
        Some("source_file_count does not match source_files length")
    } else if hex::encode(compute_source_root_hash(files).0) != manifest.source_root_hash {
        Some("source_root_hash does not match canonical source_files")
    } else {
        None
    };

    match failure {
        Some(reason) => Err(TrazaeoError::invalid_input(OPERATION, reason)),
        None => Ok(()),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fully populated entry from borrowed fixture values.
    fn entry(uri: &str, hash: &str, len: u64, mtime: &str) -> SourceFileEntry {
        SourceFileEntry {
            source_uri: uri.to_string(),
            content_hash: hash.to_string(),
            byte_length: len,
            observed_mtime: Some(mtime.to_string()),
        }
    }

    /// Two fixture entries, deliberately listed with `b.nc` before `a.nc`.
    fn files_variant_a() -> Vec<SourceFileEntry> {
        vec![
            entry("s3://bucket/b.nc", "h2", 2, "2026-01-02T00:00:00Z"),
            entry("s3://bucket/a.nc", "h1", 1, "2026-01-01T00:00:00Z"),
        ]
    }

    /// The same fixture entries in the opposite input order.
    fn files_variant_a_reversed() -> Vec<SourceFileEntry> {
        files_variant_a().into_iter().rev().collect()
    }

    /// The canonical bytes must not depend on the order of the input slice.
    #[test]
    fn canonical_source_manifest_is_order_stable() {
        let forward = canonical_source_manifest_bytes(&files_variant_a());
        let backward = canonical_source_manifest_bytes(&files_variant_a_reversed());
        assert_eq!(forward, backward);
    }

    /// The root hash must not depend on the order of the input slice.
    #[test]
    fn source_root_hash_is_order_stable() {
        let forward = compute_source_root_hash(&files_variant_a());
        let backward = compute_source_root_hash(&files_variant_a_reversed());
        assert_eq!(forward, backward);
    }
}