// pacha 0.2.5

// Model, Data and Recipe Registry with full lineage tracking
// Documentation
//! Data registry types and operations.
//!
//! Provides dataset versioning, datasheets, and provenance tracking.

mod datasheet;
mod version;

pub use datasheet::Datasheet;
pub use version::DatasetVersion;

use crate::storage::ContentAddress;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;

/// Unique identifier for a registered dataset.
///
/// A newtype around a [`Uuid`]. The wrapped value is private; read it through
/// [`DatasetId::as_uuid`]. The type can be used as a hash-map key (`Eq` + `Hash`)
/// and is serializable via serde.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct DatasetId(Uuid);

impl DatasetId {
    /// Create a new random dataset ID.
    #[must_use]
    pub fn new() -> Self {
        Self(Uuid::new_v4())
    }

    /// Create from a UUID.
    #[must_use]
    pub fn from_uuid(uuid: Uuid) -> Self {
        Self(uuid)
    }

    /// Get the underlying UUID.
    #[must_use]
    pub fn as_uuid(&self) -> &Uuid {
        &self.0
    }
}

impl Default for DatasetId {
    fn default() -> Self {
        Self::new()
    }
}

impl std::fmt::Display for DatasetId {
    /// Write the wrapped UUID using its own `Display` output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Go through `format_args!` (not a direct `Display::fmt` call) so any
        // width/flag options on `f` are not forwarded to the inner UUID.
        f.write_fmt(format_args!("{}", self.0))
    }
}

impl std::str::FromStr for DatasetId {
    type Err = uuid::Error;

    /// Parse a dataset ID from the textual form of a UUID.
    ///
    /// # Errors
    ///
    /// Returns the [`uuid::Error`] produced by [`Uuid::parse_str`] when `s`
    /// is not a valid UUID string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Uuid::parse_str(s).map(Self)
    }
}

/// Reference to a dataset (name + version).
///
/// Its `Display` implementation renders the reference as `name:version`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DatasetReference {
    /// Dataset name.
    pub name: String,
    /// Dataset version.
    pub version: DatasetVersion,
}

impl DatasetReference {
    /// Build a reference from a dataset name and a version.
    ///
    /// `name` accepts any value convertible into a `String` (for example
    /// `&str` or `String`).
    #[must_use]
    pub fn new(name: impl Into<String>, version: DatasetVersion) -> Self {
        let name = name.into();
        Self { name, version }
    }
}

impl std::fmt::Display for DatasetReference {
    /// Render the reference as `name:version`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("{}:{}", self.name, self.version))
    }
}

/// A registered dataset in the registry.
///
/// Bundles the dataset's identity (`id`, `name`, `version`) with the address
/// of its content, its datasheet, and the time it was registered.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dataset {
    /// Unique identifier.
    pub id: DatasetId,
    /// Dataset name.
    pub name: String,
    /// Dataset version.
    pub version: DatasetVersion,
    /// Content address of the data.
    pub content_address: ContentAddress,
    /// Datasheet with metadata.
    pub datasheet: Datasheet,
    /// Registration timestamp (UTC).
    pub created_at: DateTime<Utc>,
}

impl Dataset {
    /// Create a reference to this dataset.
    #[must_use]
    pub fn reference(&self) -> DatasetReference {
        DatasetReference::new(&self.name, self.version.clone())
    }
}

/// Provenance record following W3C PROV-DM.
///
/// Each variant models one provenance relation between datasets, activities
/// and agents. Activities, agents and transformations are carried as plain
/// strings; datasets are referred to by [`DatasetId`].
///
/// # Serialization
///
/// The enum is internally tagged: the serialized form carries a `type` field
/// whose value is the variant name in `snake_case` (for example
/// `"was_derived_from"`), alongside the variant's own fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ProvenanceRecord {
    /// Data was derived from source.
    WasDerivedFrom {
        /// Derived dataset.
        derived: DatasetId,
        /// Source dataset.
        source: DatasetId,
        /// Transformation applied.
        transformation: String,
    },
    /// Data was generated by activity.
    WasGeneratedBy {
        /// Generated data.
        data: DatasetId,
        /// Activity that generated it.
        activity: String,
        /// Timestamp (UTC) recorded for the generation.
        timestamp: DateTime<Utc>,
    },
    /// Activity used data.
    Used {
        /// Activity.
        activity: String,
        /// Data used.
        data: DatasetId,
    },
    /// Entity was attributed to agent.
    WasAttributedTo {
        /// Entity.
        entity: DatasetId,
        /// Agent (user/system).
        agent: String,
    },
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Two freshly generated IDs must differ.
    #[test]
    fn test_dataset_id_generation() {
        let first = DatasetId::new();
        let second = DatasetId::new();
        assert_ne!(first, second);
    }

    /// Formatting an ID and parsing it back yields the same ID.
    #[test]
    fn test_dataset_id_from_str() {
        let original = DatasetId::new();
        let text = original.to_string();
        let round_tripped = text.parse::<DatasetId>().unwrap();
        assert_eq!(original, round_tripped);
    }

    /// A reference is displayed as `name:version`.
    #[test]
    fn test_dataset_reference_display() {
        let version = DatasetVersion::new(1, 2, 3);
        let reference = DatasetReference::new("transactions", version);
        assert_eq!(reference.to_string(), "transactions:1.2.3");
    }

    /// The serialized record carries the snake_case variant tag.
    #[test]
    fn test_provenance_serialization() {
        let derived = DatasetId::new();
        let source = DatasetId::new();
        let record = ProvenanceRecord::WasDerivedFrom {
            derived,
            source,
            transformation: "normalize".to_string(),
        };

        let json = serde_json::to_string(&record).unwrap();
        assert!(json.contains("was_derived_from"));
    }
}