Skip to main content

pacha/data/
mod.rs

1//! Data registry types and operations.
2//!
3//! Provides dataset versioning, datasheets, and provenance tracking.
4
5mod datasheet;
6mod version;
7
8pub use datasheet::Datasheet;
9pub use version::DatasetVersion;
10
11use crate::storage::ContentAddress;
12use chrono::{DateTime, Utc};
13use serde::{Deserialize, Serialize};
14use uuid::Uuid;
15
16/// Unique identifier for a registered dataset.
17#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
18pub struct DatasetId(Uuid);
19
20impl DatasetId {
21    /// Create a new random dataset ID.
22    #[must_use]
23    pub fn new() -> Self {
24        Self(Uuid::new_v4())
25    }
26
27    /// Create from a UUID.
28    #[must_use]
29    pub fn from_uuid(uuid: Uuid) -> Self {
30        Self(uuid)
31    }
32
33    /// Get the underlying UUID.
34    #[must_use]
35    pub fn as_uuid(&self) -> &Uuid {
36        &self.0
37    }
38}
39
40impl Default for DatasetId {
41    fn default() -> Self {
42        Self::new()
43    }
44}
45
46impl std::fmt::Display for DatasetId {
47    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48        write!(f, "{}", self.0)
49    }
50}
51
52impl std::str::FromStr for DatasetId {
53    type Err = uuid::Error;
54
55    fn from_str(s: &str) -> Result<Self, Self::Err> {
56        Ok(Self(Uuid::parse_str(s)?))
57    }
58}
59
60/// Reference to a dataset (name + version).
61#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
62pub struct DatasetReference {
63    /// Dataset name.
64    pub name: String,
65    /// Dataset version.
66    pub version: DatasetVersion,
67}
68
69impl DatasetReference {
70    /// Create a new dataset reference.
71    #[must_use]
72    pub fn new(name: impl Into<String>, version: DatasetVersion) -> Self {
73        Self { name: name.into(), version }
74    }
75}
76
77impl std::fmt::Display for DatasetReference {
78    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79        write!(f, "{}:{}", self.name, self.version)
80    }
81}
82
83/// A registered dataset in the registry.
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct Dataset {
86    /// Unique identifier.
87    pub id: DatasetId,
88    /// Dataset name.
89    pub name: String,
90    /// Dataset version.
91    pub version: DatasetVersion,
92    /// Content address of the data.
93    pub content_address: ContentAddress,
94    /// Datasheet with metadata.
95    pub datasheet: Datasheet,
96    /// Registration timestamp.
97    pub created_at: DateTime<Utc>,
98}
99
100impl Dataset {
101    /// Create a reference to this dataset.
102    #[must_use]
103    pub fn reference(&self) -> DatasetReference {
104        DatasetReference::new(&self.name, self.version.clone())
105    }
106}
107
108/// Provenance record following W3C PROV-DM.
109#[derive(Debug, Clone, Serialize, Deserialize)]
110#[serde(tag = "type", rename_all = "snake_case")]
111pub enum ProvenanceRecord {
112    /// Data was derived from source.
113    WasDerivedFrom {
114        /// Derived dataset.
115        derived: DatasetId,
116        /// Source dataset.
117        source: DatasetId,
118        /// Transformation applied.
119        transformation: String,
120    },
121    /// Data was generated by activity.
122    WasGeneratedBy {
123        /// Generated data.
124        data: DatasetId,
125        /// Activity that generated it.
126        activity: String,
127        /// Timestamp.
128        timestamp: DateTime<Utc>,
129    },
130    /// Activity used data.
131    Used {
132        /// Activity.
133        activity: String,
134        /// Data used.
135        data: DatasetId,
136    },
137    /// Entity was attributed to agent.
138    WasAttributedTo {
139        /// Entity.
140        entity: DatasetId,
141        /// Agent (user/system).
142        agent: String,
143    },
144}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// Two freshly generated identifiers must differ.
    #[test]
    fn test_dataset_id_generation() {
        let id1 = DatasetId::new();
        let id2 = DatasetId::new();
        assert_ne!(id1, id2);
    }

    /// The `Display` output of an identifier must parse back to the same value.
    #[test]
    fn test_dataset_id_from_str() {
        let id = DatasetId::new();
        let s = id.to_string();
        let parsed: DatasetId = s.parse().unwrap();
        assert_eq!(id, parsed);
    }

    /// Strings that are not UUIDs must be rejected rather than accepted.
    #[test]
    fn test_dataset_id_from_str_rejects_invalid_input() {
        assert!("not-a-uuid".parse::<DatasetId>().is_err());
        assert!("".parse::<DatasetId>().is_err());
    }

    /// Wrapping a UUID and reading it back must yield the same UUID.
    #[test]
    fn test_dataset_id_uuid_round_trip() {
        let uuid = Uuid::new_v4();
        let id = DatasetId::from_uuid(uuid);
        assert_eq!(id.as_uuid(), &uuid);
    }

    /// A reference is displayed as `name:version`.
    #[test]
    fn test_dataset_reference_display() {
        let reference = DatasetReference::new("transactions", DatasetVersion::new(1, 2, 3));
        assert_eq!(reference.to_string(), "transactions:1.2.3");
    }

    /// Checks the serialized tag and that the record survives a round trip.
    #[test]
    fn test_provenance_serialization() {
        let derived = DatasetId::new();
        let source = DatasetId::new();
        let record = ProvenanceRecord::WasDerivedFrom {
            derived: derived.clone(),
            source: source.clone(),
            transformation: "normalize".to_string(),
        };

        let json = serde_json::to_string(&record).unwrap();
        assert!(json.contains("was_derived_from"));

        // Inspect the structure rather than the raw text so the check does not
        // depend on key order or whitespace in the serialized output.
        let value: serde_json::Value = serde_json::from_str(&json).unwrap();
        assert_eq!(value["type"], "was_derived_from");
        assert_eq!(value["transformation"], "normalize");

        // Deserializing must give back the same variant with the same fields.
        let decoded: ProvenanceRecord = serde_json::from_str(&json).unwrap();
        assert!(matches!(
            &decoded,
            ProvenanceRecord::WasDerivedFrom { derived: d, source: s, transformation: t }
                if d == &derived && s == &source && t == "normalize"
        ));
    }
}