Skip to main content

dataprof_runtime/
profile_report.rs

1use dataprof_core::{ColumnProfile, DataSource, ExecutionMetadata};
2use dataprof_metrics::{QualityAssessment, QualityMetrics};
3
4/// Complete profiling report for a data source.
5///
6/// Contains column-level statistics, execution metadata, and an optional
7/// ISO 8000/25012 quality assessment. This is the primary output of all
8/// profiling operations (`Profiler::analyze_file`, `Profiler::analyze_source`,
9/// `Profiler::profile_stream`, etc.).
10#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
11pub struct ProfileReport {
12    /// Unique identifier for this report (UUID v4)
13    pub id: String,
14    /// Timestamp when the report was generated (ISO 8601 / RFC 3339)
15    pub timestamp: String,
16    /// Data source metadata (file, query, etc.)
17    pub data_source: DataSource,
18    /// Column-level profiling results
19    pub column_profiles: Vec<ColumnProfile>,
20    /// Execution metadata (timing, rows processed, truncation info, etc.)
21    #[serde(alias = "scan_info")]
22    pub execution: ExecutionMetadata,
23    /// Data quality assessment (optional — partial analysis may skip quality)
24    #[serde(
25        alias = "data_quality_metrics",
26        skip_serializing_if = "Option::is_none",
27        default,
28        deserialize_with = "deserialize_quality_compat"
29    )]
30    pub quality: Option<QualityAssessment>,
31}
32
33impl ProfileReport {
34    /// Create a new ProfileReport with auto-generated id and timestamp
35    pub fn new(
36        data_source: DataSource,
37        column_profiles: Vec<ColumnProfile>,
38        execution: ExecutionMetadata,
39        quality: Option<QualityAssessment>,
40    ) -> Self {
41        Self {
42            id: uuid::Uuid::new_v4().to_string(),
43            timestamp: chrono::Utc::now().to_rfc3339(),
44            data_source,
45            column_profiles,
46            execution,
47            quality,
48        }
49    }
50
51    /// Override the auto-generated ID (useful for deterministic caching/testing)
52    pub fn with_id(mut self, id: impl Into<String>) -> Self {
53        self.id = id.into();
54        self
55    }
56
57    /// Override the auto-generated timestamp
58    pub fn with_timestamp(mut self, timestamp: impl Into<String>) -> Self {
59        self.timestamp = timestamp.into();
60        self
61    }
62
63    /// Calculate overall quality score using ISO 8000/25012 metrics.
64    /// Returns `None` if quality metrics were not computed.
65    pub fn quality_score(&self) -> Option<f64> {
66        self.quality.as_ref().map(|q| q.score())
67    }
68
69    /// Get the data source identifier (for backwards compatibility)
70    pub fn source_identifier(&self) -> String {
71        self.data_source.identifier()
72    }
73}
74
75/// Custom deserializer that handles both legacy `DataQualityMetrics` (flat)
76/// and new `QualityAssessment` (wrapped with confidence) JSON formats.
77fn deserialize_quality_compat<'de, D>(
78    deserializer: D,
79) -> Result<Option<QualityAssessment>, D::Error>
80where
81    D: serde::Deserializer<'de>,
82{
83    use serde::Deserialize;
84
85    let value: Option<serde_json::Value> = Option::deserialize(deserializer)?;
86    match value {
87        None => Ok(None),
88        Some(v) => {
89            if v.get("metrics").is_some() && v.get("confidence").is_some() {
90                let assessment: QualityAssessment =
91                    serde_json::from_value(v).map_err(serde::de::Error::custom)?;
92                Ok(Some(assessment))
93            } else {
94                let metrics: QualityMetrics =
95                    serde_json::from_value(v).map_err(serde::de::Error::custom)?;
96                Ok(Some(QualityAssessment::exact(metrics)))
97            }
98        }
99    }
100}
101
#[cfg(test)]
mod tests {
    use super::*;
    use dataprof_core::FileFormat;
    use dataprof_metrics::MetricConfidence;
    use serde_json::json;

    /// Builds the CSV-backed report used by the round-trip tests, attaching
    /// the given quality assessment.
    fn sample_report(quality: Option<QualityAssessment>) -> ProfileReport {
        let source = DataSource::File {
            path: "test.csv".to_string(),
            format: FileFormat::Csv,
            size_bytes: 1024,
            modified_at: None,
            parquet_metadata: None,
        };
        let execution = ExecutionMetadata::new(100, 5, 50);

        ProfileReport::new(source, Vec::new(), execution, quality)
    }

    /// Serializes `report` to a JSON string and parses it back.
    fn roundtrip(report: &ProfileReport) -> ProfileReport {
        let encoded = serde_json::to_string(report).unwrap();
        serde_json::from_str(&encoded).unwrap()
    }

    /// A report with a quality assessment survives a JSON round trip.
    #[test]
    fn test_profile_report_json_roundtrip() {
        let original = sample_report(Some(QualityAssessment::exact(QualityMetrics::empty())));

        let restored = roundtrip(&original);

        assert_eq!(restored.id, original.id);
        assert_eq!(restored.timestamp, original.timestamp);
        assert_eq!(restored.source_identifier(), "test.csv");
        assert_eq!(restored.execution.rows_processed, 100);
        assert!(restored.quality.is_some());
    }

    /// A report without a quality assessment round-trips with `quality` still `None`.
    #[test]
    fn test_profile_report_without_quality() {
        let original = sample_report(None);

        let restored = roundtrip(&original);

        assert!(restored.quality.is_none());
        assert_eq!(restored.execution.rows_processed, 100);
    }

    /// The legacy key names (`scan_info`, `data_quality_metrics`) and the flat
    /// metrics layout are still accepted on input.
    #[test]
    fn test_profile_report_deserializes_legacy_quality_metrics() {
        let legacy_payload = json!({
            "id": "legacy-report",
            "timestamp": "2026-05-22T10:00:00Z",
            "data_source": {
                "type": "file",
                "path": "test.csv",
                "format": "csv",
                "size_bytes": 42
            },
            "column_profiles": [],
            "scan_info": {
                "rows_processed": 10,
                "columns_detected": 2,
                "scan_time_ms": 5,
                "error_count": 0,
                "source_exhausted": true,
                "sampling_applied": false
            },
            "data_quality_metrics": {
                "completeness": {
                    "missing_values_ratio": 0.0,
                    "complete_records_ratio": 100.0,
                    "null_columns": []
                }
            }
        });

        let report: ProfileReport = serde_json::from_value(legacy_payload).unwrap();

        assert_eq!(report.id, "legacy-report");
        assert_eq!(report.execution.rows_processed, 10);

        let quality = report
            .quality
            .expect("expected legacy quality to deserialize");
        assert!(matches!(quality.confidence, MetricConfidence::Exact));

        let deviation = (quality.score() - 100.0).abs();
        assert!(deviation < 0.01);
    }
}