Skip to main content

datasynth_runtime/
label_export.rs

//! Anomaly label export functionality.
//!
//! This module provides functions for exporting anomaly labels to various formats
//! (CSV, JSON, JSON Lines) for ML training and audit purposes.
5
6use datasynth_core::models::LabeledAnomaly;
7use serde::Serialize;
8use std::fs::File;
9use std::io::{BufWriter, Write};
10use std::path::Path;
11
12/// Error type for label export operations.
13#[derive(Debug, thiserror::Error)]
14pub enum LabelExportError {
15    #[error("IO error: {0}")]
16    Io(#[from] std::io::Error),
17    #[error("CSV error: {0}")]
18    Csv(#[from] csv::Error),
19    #[error("JSON error: {0}")]
20    Json(#[from] serde_json::Error),
21}
22
23/// Result type for label export operations.
24pub type LabelExportResult<T> = Result<T, LabelExportError>;
25
26/// Flattened anomaly label for CSV export.
27/// This structure converts nested fields to flat columns for easier CSV handling.
28#[derive(Debug, Serialize)]
29pub struct FlatAnomalyLabel {
30    // Core fields
31    pub anomaly_id: String,
32    pub anomaly_category: String,
33    pub anomaly_type: String,
34    pub document_id: String,
35    pub document_type: String,
36    pub company_code: String,
37    pub anomaly_date: String,
38    pub detection_timestamp: String,
39    pub confidence: f64,
40    pub severity: u8,
41    pub description: String,
42    pub is_injected: bool,
43    pub monetary_impact: Option<String>,
44    pub related_entities: String, // JSON array as string
45    pub cluster_id: Option<String>,
46
47    // Provenance fields
48    pub original_document_hash: Option<String>,
49    pub injection_strategy: Option<String>,
50    pub structured_strategy_type: Option<String>,
51    pub structured_strategy_json: Option<String>,
52    pub causal_reason_type: Option<String>,
53    pub causal_reason_json: Option<String>,
54    pub parent_anomaly_id: Option<String>,
55    pub child_anomaly_ids: String, // JSON array as string
56    pub scenario_id: Option<String>,
57    pub run_id: Option<String>,
58    pub generation_seed: Option<u64>,
59
60    // Metadata as JSON
61    pub metadata_json: String,
62}
63
64impl From<&LabeledAnomaly> for FlatAnomalyLabel {
65    fn from(label: &LabeledAnomaly) -> Self {
66        Self {
67            anomaly_id: label.anomaly_id.clone(),
68            anomaly_category: label.anomaly_type.category().to_string(),
69            anomaly_type: label.anomaly_type.type_name(),
70            document_id: label.document_id.clone(),
71            document_type: label.document_type.clone(),
72            company_code: label.company_code.clone(),
73            anomaly_date: label.anomaly_date.to_string(),
74            detection_timestamp: label.detection_timestamp.to_string(),
75            confidence: label.confidence,
76            severity: label.severity,
77            description: label.description.clone(),
78            is_injected: label.is_injected,
79            monetary_impact: label.monetary_impact.map(|d| d.to_string()),
80            related_entities: serde_json::to_string(&label.related_entities).unwrap_or_default(),
81            cluster_id: label.cluster_id.clone(),
82
83            // Provenance fields
84            original_document_hash: label.original_document_hash.clone(),
85            injection_strategy: label.injection_strategy.clone(),
86            structured_strategy_type: label
87                .structured_strategy
88                .as_ref()
89                .map(|s| s.strategy_type().to_string()),
90            structured_strategy_json: label
91                .structured_strategy
92                .as_ref()
93                .map(|s| serde_json::to_string(s).unwrap_or_default()),
94            causal_reason_type: label.causal_reason.as_ref().map(|r| match r {
95                datasynth_core::models::AnomalyCausalReason::RandomRate { .. } => {
96                    "RandomRate".to_string()
97                }
98                datasynth_core::models::AnomalyCausalReason::TemporalPattern { .. } => {
99                    "TemporalPattern".to_string()
100                }
101                datasynth_core::models::AnomalyCausalReason::EntityTargeting { .. } => {
102                    "EntityTargeting".to_string()
103                }
104                datasynth_core::models::AnomalyCausalReason::ClusterMembership { .. } => {
105                    "ClusterMembership".to_string()
106                }
107                datasynth_core::models::AnomalyCausalReason::ScenarioStep { .. } => {
108                    "ScenarioStep".to_string()
109                }
110                datasynth_core::models::AnomalyCausalReason::DataQualityProfile { .. } => {
111                    "DataQualityProfile".to_string()
112                }
113                datasynth_core::models::AnomalyCausalReason::MLTrainingBalance { .. } => {
114                    "MLTrainingBalance".to_string()
115                }
116            }),
117            causal_reason_json: label
118                .causal_reason
119                .as_ref()
120                .map(|r| serde_json::to_string(r).unwrap_or_default()),
121            parent_anomaly_id: label.parent_anomaly_id.clone(),
122            child_anomaly_ids: serde_json::to_string(&label.child_anomaly_ids).unwrap_or_default(),
123            scenario_id: label.scenario_id.clone(),
124            run_id: label.run_id.clone(),
125            generation_seed: label.generation_seed,
126
127            metadata_json: serde_json::to_string(&label.metadata).unwrap_or_default(),
128        }
129    }
130}
131
/// Options controlling how labels are exported.
#[derive(Debug, Clone)]
pub struct LabelExportConfig {
    /// Whether to include all provenance fields.
    pub include_provenance: bool,
    /// Whether to include metadata JSON.
    pub include_metadata: bool,
    /// Whether to pretty-print JSON output.
    pub pretty_json: bool,
}

impl Default for LabelExportConfig {
    /// Returns the default configuration, in which every option is enabled.
    fn default() -> Self {
        LabelExportConfig {
            include_provenance: true,
            include_metadata: true,
            pretty_json: true,
        }
    }
}
152
153/// Exports anomaly labels to a CSV file.
154pub fn export_labels_csv(
155    labels: &[LabeledAnomaly],
156    path: &Path,
157    _config: &LabelExportConfig,
158) -> LabelExportResult<usize> {
159    let file = File::create(path)?;
160    let mut writer = csv::Writer::from_writer(BufWriter::new(file));
161
162    for label in labels {
163        let flat: FlatAnomalyLabel = label.into();
164        writer.serialize(&flat)?;
165    }
166
167    writer.flush()?;
168    Ok(labels.len())
169}
170
171/// Exports anomaly labels to a JSON file (array format).
172pub fn export_labels_json(
173    labels: &[LabeledAnomaly],
174    path: &Path,
175    config: &LabelExportConfig,
176) -> LabelExportResult<usize> {
177    let file = File::create(path)?;
178    let writer = BufWriter::new(file);
179
180    if config.pretty_json {
181        serde_json::to_writer_pretty(writer, labels)?;
182    } else {
183        serde_json::to_writer(writer, labels)?;
184    }
185
186    Ok(labels.len())
187}
188
189/// Exports anomaly labels to a JSON Lines file (one JSON object per line).
190pub fn export_labels_jsonl(
191    labels: &[LabeledAnomaly],
192    path: &Path,
193    _config: &LabelExportConfig,
194) -> LabelExportResult<usize> {
195    let file = File::create(path)?;
196    let mut writer = BufWriter::new(file);
197
198    for label in labels {
199        let json = serde_json::to_string(label)?;
200        writeln!(writer, "{}", json)?;
201    }
202
203    writer.flush()?;
204    Ok(labels.len())
205}
206
207/// Exports anomaly labels to multiple formats at once.
208pub fn export_labels_all_formats(
209    labels: &[LabeledAnomaly],
210    output_dir: &Path,
211    base_name: &str,
212    config: &LabelExportConfig,
213) -> LabelExportResult<Vec<(String, usize)>> {
214    std::fs::create_dir_all(output_dir)?;
215
216    let mut results = Vec::new();
217
218    // CSV
219    let csv_path = output_dir.join(format!("{}.csv", base_name));
220    let count = export_labels_csv(labels, &csv_path, config)?;
221    results.push((csv_path.display().to_string(), count));
222
223    // JSON
224    let json_path = output_dir.join(format!("{}.json", base_name));
225    let count = export_labels_json(labels, &json_path, config)?;
226    results.push((json_path.display().to_string(), count));
227
228    // JSONL
229    let jsonl_path = output_dir.join(format!("{}.jsonl", base_name));
230    let count = export_labels_jsonl(labels, &jsonl_path, config)?;
231    results.push((jsonl_path.display().to_string(), count));
232
233    Ok(results)
234}
235
236/// Summary statistics for exported labels.
237#[derive(Debug, Clone, Serialize)]
238pub struct LabelExportSummary {
239    /// Total labels exported.
240    pub total_labels: usize,
241    /// Labels by category.
242    pub by_category: std::collections::HashMap<String, usize>,
243    /// Labels by company.
244    pub by_company: std::collections::HashMap<String, usize>,
245    /// Labels with provenance.
246    pub with_provenance: usize,
247    /// Labels in scenarios.
248    pub in_scenarios: usize,
249    /// Labels in clusters.
250    pub in_clusters: usize,
251}
252
253impl LabelExportSummary {
254    /// Creates a summary from a list of labels.
255    pub fn from_labels(labels: &[LabeledAnomaly]) -> Self {
256        let mut by_category = std::collections::HashMap::new();
257        let mut by_company = std::collections::HashMap::new();
258        let mut with_provenance = 0;
259        let mut in_scenarios = 0;
260        let mut in_clusters = 0;
261
262        for label in labels {
263            *by_category
264                .entry(label.anomaly_type.category().to_string())
265                .or_insert(0) += 1;
266            *by_company.entry(label.company_code.clone()).or_insert(0) += 1;
267
268            if label.causal_reason.is_some() || label.structured_strategy.is_some() {
269                with_provenance += 1;
270            }
271            if label.scenario_id.is_some() {
272                in_scenarios += 1;
273            }
274            if label.cluster_id.is_some() {
275                in_clusters += 1;
276            }
277        }
278
279        Self {
280            total_labels: labels.len(),
281            by_category,
282            by_company,
283            with_provenance,
284            in_scenarios,
285            in_clusters,
286        }
287    }
288
289    /// Writes the summary to a JSON file.
290    pub fn write_to_file(&self, path: &Path) -> LabelExportResult<()> {
291        let file = File::create(path)?;
292        let writer = BufWriter::new(file);
293        serde_json::to_writer_pretty(writer, self)?;
294        Ok(())
295    }
296}
297
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{AnomalyCausalReason, AnomalyType, FraudType};
    use tempfile::TempDir;

    /// Returns the given day of January 2024.
    fn jan_2024(day: u32) -> NaiveDate {
        NaiveDate::from_ymd_opt(2024, 1, day).unwrap()
    }

    /// Two fraud labels: one with run/seed/causal-reason provenance, one in a cluster.
    fn create_test_labels() -> Vec<LabeledAnomaly> {
        let self_approval = LabeledAnomaly::new(
            "ANO001".to_string(),
            AnomalyType::Fraud(FraudType::SelfApproval),
            "JE001".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            jan_2024(15),
        )
        .with_run_id("run-123")
        .with_generation_seed(42)
        .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let duplicate_payment = LabeledAnomaly::new(
            "ANO002".to_string(),
            AnomalyType::Fraud(FraudType::DuplicatePayment),
            "JE002".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            jan_2024(16),
        )
        .with_cluster("cluster-1");

        vec![self_approval, duplicate_payment]
    }

    #[test]
    fn test_export_csv() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.csv");

        let written =
            export_labels_csv(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("SelfApproval"));
    }

    #[test]
    fn test_export_json() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.json");

        let written =
            export_labels_json(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("run-123"));
    }

    #[test]
    fn test_export_jsonl() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.jsonl");

        let written =
            export_labels_jsonl(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        // One JSON object per label, one label per line.
        let text = std::fs::read_to_string(&target).unwrap();
        assert_eq!(text.lines().count(), 2);
    }

    #[test]
    fn test_export_all_formats() {
        let dir = TempDir::new().unwrap();

        let outputs = export_labels_all_formats(
            &create_test_labels(),
            dir.path(),
            "anomaly_labels",
            &LabelExportConfig::default(),
        )
        .unwrap();

        assert_eq!(outputs.len(), 3);
        for file_name in [
            "anomaly_labels.csv",
            "anomaly_labels.json",
            "anomaly_labels.jsonl",
        ]
        .iter()
        {
            assert!(dir.path().join(file_name).exists());
        }
    }

    #[test]
    fn test_label_export_summary() {
        let stats = LabelExportSummary::from_labels(&create_test_labels());

        assert_eq!(stats.total_labels, 2);
        assert_eq!(stats.by_category.get("Fraud"), Some(&2));
        assert_eq!(stats.with_provenance, 1);
        assert_eq!(stats.in_clusters, 1);
    }

    #[test]
    fn test_flat_label_conversion() {
        let source = LabeledAnomaly::new(
            "ANO001".to_string(),
            AnomalyType::Fraud(FraudType::SelfApproval),
            "JE001".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            jan_2024(15),
        )
        .with_run_id("run-123")
        .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let row = FlatAnomalyLabel::from(&source);

        assert_eq!(row.anomaly_id, "ANO001");
        assert_eq!(row.anomaly_category, "Fraud");
        assert_eq!(row.run_id, Some("run-123".to_string()));
        assert_eq!(row.causal_reason_type, Some("RandomRate".to_string()));
    }
}