Skip to main content

datasynth_runtime/
label_export.rs

1//! Anomaly label export functionality.
2//!
3//! This module provides functions for exporting anomaly labels to various formats
4//! (CSV, JSON, JSON Lines) for ML training and audit purposes.
5
6use datasynth_core::models::LabeledAnomaly;
7use serde::Serialize;
8use std::fs::File;
9use std::io::{BufWriter, Write};
10use std::path::Path;
11
12/// Error type for label export operations.
13#[derive(Debug, thiserror::Error)]
14pub enum LabelExportError {
15    #[error("IO error: {0}")]
16    Io(#[from] std::io::Error),
17    #[error("CSV error: {0}")]
18    Csv(#[from] csv::Error),
19    #[error("JSON error: {0}")]
20    Json(#[from] serde_json::Error),
21}
22
23/// Result type for label export operations.
24pub type LabelExportResult<T> = Result<T, LabelExportError>;
25
26/// Flattened anomaly label for CSV export.
27/// This structure converts nested fields to flat columns for easier CSV handling.
28#[derive(Debug, Serialize)]
29pub struct FlatAnomalyLabel {
30    // Core fields
31    pub anomaly_id: String,
32    pub anomaly_category: String,
33    pub anomaly_type: String,
34    pub document_id: String,
35    pub document_type: String,
36    pub company_code: String,
37    pub anomaly_date: String,
38    pub detection_timestamp: String,
39    pub confidence: f64,
40    pub severity: u8,
41    /// In-principle observability class (per_je_density / relational_graph / temporal /
42    /// memory_only) — which detection arm can surface this family. See `ObservabilityClass`.
43    pub observability: String,
44    pub description: String,
45    pub is_injected: bool,
46    pub monetary_impact: Option<String>,
47    pub related_entities: String, // JSON array as string
48    pub cluster_id: Option<String>,
49
50    // Provenance fields
51    pub original_document_hash: Option<String>,
52    pub injection_strategy: Option<String>,
53    pub structured_strategy_type: Option<String>,
54    pub structured_strategy_json: Option<String>,
55    pub causal_reason_type: Option<String>,
56    pub causal_reason_json: Option<String>,
57    pub parent_anomaly_id: Option<String>,
58    pub child_anomaly_ids: String, // JSON array as string
59    pub scenario_id: Option<String>,
60    pub run_id: Option<String>,
61    pub generation_seed: Option<u64>,
62
63    // Metadata as JSON
64    pub metadata_json: String,
65}
66
67fn serialize_or_warn<T: serde::Serialize>(value: &T, field_name: &str) -> String {
68    serde_json::to_string(value).unwrap_or_else(|e| {
69        tracing::warn!("Failed to serialize {} for label export: {}", field_name, e);
70        String::new()
71    })
72}
73
74impl From<&LabeledAnomaly> for FlatAnomalyLabel {
75    fn from(label: &LabeledAnomaly) -> Self {
76        Self {
77            anomaly_id: label.anomaly_id.clone(),
78            anomaly_category: label.anomaly_type.category().to_string(),
79            anomaly_type: label.anomaly_type.type_name(),
80            document_id: label.document_id.clone(),
81            document_type: label.document_type.clone(),
82            company_code: label.company_code.clone(),
83            anomaly_date: label.anomaly_date.to_string(),
84            detection_timestamp: label.detection_timestamp.to_string(),
85            confidence: label.confidence,
86            severity: label.severity,
87            observability: label.observability.as_str().to_string(),
88            description: label.description.clone(),
89            is_injected: label.is_injected,
90            monetary_impact: label.monetary_impact.map(|d| d.to_string()),
91            related_entities: serialize_or_warn(&label.related_entities, "related_entities"),
92            cluster_id: label.cluster_id.clone(),
93
94            // Provenance fields
95            original_document_hash: label.original_document_hash.clone(),
96            injection_strategy: label.injection_strategy.clone(),
97            structured_strategy_type: label
98                .structured_strategy
99                .as_ref()
100                .map(|s| s.strategy_type().to_string()),
101            structured_strategy_json: label
102                .structured_strategy
103                .as_ref()
104                .map(|s| serialize_or_warn(s, "structured_strategy")),
105            causal_reason_type: label.causal_reason.as_ref().map(|r| match r {
106                datasynth_core::models::AnomalyCausalReason::RandomRate { .. } => {
107                    "RandomRate".to_string()
108                }
109                datasynth_core::models::AnomalyCausalReason::TemporalPattern { .. } => {
110                    "TemporalPattern".to_string()
111                }
112                datasynth_core::models::AnomalyCausalReason::EntityTargeting { .. } => {
113                    "EntityTargeting".to_string()
114                }
115                datasynth_core::models::AnomalyCausalReason::ClusterMembership { .. } => {
116                    "ClusterMembership".to_string()
117                }
118                datasynth_core::models::AnomalyCausalReason::ScenarioStep { .. } => {
119                    "ScenarioStep".to_string()
120                }
121                datasynth_core::models::AnomalyCausalReason::DataQualityProfile { .. } => {
122                    "DataQualityProfile".to_string()
123                }
124                datasynth_core::models::AnomalyCausalReason::MLTrainingBalance { .. } => {
125                    "MLTrainingBalance".to_string()
126                }
127            }),
128            causal_reason_json: label
129                .causal_reason
130                .as_ref()
131                .map(|r| serialize_or_warn(r, "causal_reason")),
132            parent_anomaly_id: label.parent_anomaly_id.clone(),
133            child_anomaly_ids: serialize_or_warn(&label.child_anomaly_ids, "child_anomaly_ids"),
134            scenario_id: label.scenario_id.clone(),
135            run_id: label.run_id.clone(),
136            generation_seed: label.generation_seed,
137
138            metadata_json: serialize_or_warn(&label.metadata, "metadata"),
139        }
140    }
141}
142
/// Options accepted by the label exporters.
///
/// NOTE(review): among the exporters in this module only `export_labels_json`
/// reads a field (`pretty_json`); `include_provenance` and `include_metadata`
/// are not consulted by any exporter visible here.
#[derive(Debug, Clone)]
pub struct LabelExportConfig {
    /// Whether to include all provenance fields.
    pub include_provenance: bool,
    /// Whether to include the metadata JSON.
    pub include_metadata: bool,
    /// Whether JSON output is pretty-printed rather than compact.
    pub pretty_json: bool,
}

impl Default for LabelExportConfig {
    /// Returns a configuration with every option switched on.
    fn default() -> Self {
        // Written out by hand: a derived `Default` would set every flag to `false`.
        let enabled = true;
        LabelExportConfig {
            include_provenance: enabled,
            include_metadata: enabled,
            pretty_json: enabled,
        }
    }
}
163
164/// Exports anomaly labels to a CSV file.
165pub fn export_labels_csv(
166    labels: &[LabeledAnomaly],
167    path: &Path,
168    _config: &LabelExportConfig,
169) -> LabelExportResult<usize> {
170    let file = File::create(path)?;
171    let mut writer = csv::Writer::from_writer(BufWriter::with_capacity(256 * 1024, file));
172
173    for label in labels {
174        let flat: FlatAnomalyLabel = label.into();
175        writer.serialize(&flat)?;
176    }
177
178    writer.flush()?;
179    Ok(labels.len())
180}
181
182/// Exports anomaly labels to a JSON file (array format).
183pub fn export_labels_json(
184    labels: &[LabeledAnomaly],
185    path: &Path,
186    config: &LabelExportConfig,
187) -> LabelExportResult<usize> {
188    let file = File::create(path)?;
189    let writer = BufWriter::with_capacity(256 * 1024, file);
190
191    if config.pretty_json {
192        serde_json::to_writer_pretty(writer, labels)?;
193    } else {
194        serde_json::to_writer(writer, labels)?;
195    }
196
197    Ok(labels.len())
198}
199
200/// Exports anomaly labels to a JSON Lines file (one JSON object per line).
201pub fn export_labels_jsonl(
202    labels: &[LabeledAnomaly],
203    path: &Path,
204    _config: &LabelExportConfig,
205) -> LabelExportResult<usize> {
206    let file = File::create(path)?;
207    let mut writer = BufWriter::with_capacity(256 * 1024, file);
208
209    for label in labels {
210        let json = serde_json::to_string(label)?;
211        writeln!(writer, "{json}")?;
212    }
213
214    writer.flush()?;
215    Ok(labels.len())
216}
217
218/// Exports anomaly labels to multiple formats at once.
219pub fn export_labels_all_formats(
220    labels: &[LabeledAnomaly],
221    output_dir: &Path,
222    base_name: &str,
223    config: &LabelExportConfig,
224) -> LabelExportResult<Vec<(String, usize)>> {
225    std::fs::create_dir_all(output_dir)?;
226
227    let mut results = Vec::new();
228
229    // CSV
230    let csv_path = output_dir.join(format!("{base_name}.csv"));
231    let count = export_labels_csv(labels, &csv_path, config)?;
232    results.push((csv_path.display().to_string(), count));
233
234    // JSON
235    let json_path = output_dir.join(format!("{base_name}.json"));
236    let count = export_labels_json(labels, &json_path, config)?;
237    results.push((json_path.display().to_string(), count));
238
239    // JSONL
240    let jsonl_path = output_dir.join(format!("{base_name}.jsonl"));
241    let count = export_labels_jsonl(labels, &jsonl_path, config)?;
242    results.push((jsonl_path.display().to_string(), count));
243
244    Ok(results)
245}
246
247/// Summary statistics for exported labels.
248#[derive(Debug, Clone, Serialize)]
249pub struct LabelExportSummary {
250    /// Total labels exported.
251    pub total_labels: usize,
252    /// Labels by category.
253    pub by_category: std::collections::HashMap<String, usize>,
254    /// Labels by company.
255    pub by_company: std::collections::HashMap<String, usize>,
256    /// Labels with provenance.
257    pub with_provenance: usize,
258    /// Labels in scenarios.
259    pub in_scenarios: usize,
260    /// Labels in clusters.
261    pub in_clusters: usize,
262}
263
264impl LabelExportSummary {
265    /// Creates a summary from a list of labels.
266    pub fn from_labels(labels: &[LabeledAnomaly]) -> Self {
267        let mut by_category = std::collections::HashMap::new();
268        let mut by_company = std::collections::HashMap::new();
269        let mut with_provenance = 0;
270        let mut in_scenarios = 0;
271        let mut in_clusters = 0;
272
273        for label in labels {
274            *by_category
275                .entry(label.anomaly_type.category().to_string())
276                .or_insert(0) += 1;
277            *by_company.entry(label.company_code.clone()).or_insert(0) += 1;
278
279            if label.causal_reason.is_some() || label.structured_strategy.is_some() {
280                with_provenance += 1;
281            }
282            if label.scenario_id.is_some() {
283                in_scenarios += 1;
284            }
285            if label.cluster_id.is_some() {
286                in_clusters += 1;
287            }
288        }
289
290        Self {
291            total_labels: labels.len(),
292            by_category,
293            by_company,
294            with_provenance,
295            in_scenarios,
296            in_clusters,
297        }
298    }
299
300    /// Writes the summary to a JSON file.
301    pub fn write_to_file(&self, path: &Path) -> LabelExportResult<()> {
302        let file = File::create(path)?;
303        let writer = BufWriter::with_capacity(256 * 1024, file);
304        serde_json::to_writer_pretty(writer, self)?;
305        Ok(())
306    }
307}
308
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{AnomalyCausalReason, AnomalyType, FraudType};
    use tempfile::TempDir;

    /// Builds a bare fraud label on document type "JE" / company "1000",
    /// dated on the given day of January 2024.
    fn fraud_label(id: &str, fraud: FraudType, document_id: &str, day: u32) -> LabeledAnomaly {
        LabeledAnomaly::new(
            id.to_string(),
            AnomalyType::Fraud(fraud),
            document_id.to_string(),
            "JE".to_string(),
            "1000".to_string(),
            NaiveDate::from_ymd_opt(2024, 1, day).unwrap(),
        )
    }

    /// Two-label fixture: one label with provenance, one label in a cluster.
    fn create_test_labels() -> Vec<LabeledAnomaly> {
        let with_provenance = fraud_label("ANO001", FraudType::SelfApproval, "JE001", 15)
            .with_run_id("run-123")
            .with_generation_seed(42)
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let clustered = fraud_label("ANO002", FraudType::DuplicatePayment, "JE002", 16)
            .with_cluster("cluster-1");

        vec![with_provenance, clustered]
    }

    /// Reads an exported file back as text.
    fn read_back(path: &Path) -> String {
        std::fs::read_to_string(path).unwrap()
    }

    #[test]
    fn test_export_csv() {
        let temp_dir = TempDir::new().unwrap();
        let target = temp_dir.path().join("labels.csv");

        let written =
            export_labels_csv(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = read_back(&target);
        assert!(text.contains("ANO001"));
        assert!(text.contains("SelfApproval"));
    }

    #[test]
    fn test_export_json() {
        let temp_dir = TempDir::new().unwrap();
        let target = temp_dir.path().join("labels.json");

        let written =
            export_labels_json(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = read_back(&target);
        assert!(text.contains("ANO001"));
        assert!(text.contains("run-123"));
    }

    #[test]
    fn test_export_jsonl() {
        let temp_dir = TempDir::new().unwrap();
        let target = temp_dir.path().join("labels.jsonl");

        let written =
            export_labels_jsonl(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        // One line per label.
        let text = read_back(&target);
        assert_eq!(text.lines().count(), 2);
    }

    #[test]
    fn test_export_all_formats() {
        let temp_dir = TempDir::new().unwrap();
        let out_dir = temp_dir.path();

        let results = export_labels_all_formats(
            &create_test_labels(),
            out_dir,
            "anomaly_labels",
            &LabelExportConfig::default(),
        )
        .unwrap();

        assert_eq!(results.len(), 3);
        for file_name in [
            "anomaly_labels.csv",
            "anomaly_labels.json",
            "anomaly_labels.jsonl",
        ] {
            assert!(out_dir.join(file_name).exists());
        }
    }

    #[test]
    fn test_label_export_summary() {
        let summary = LabelExportSummary::from_labels(&create_test_labels());

        assert_eq!(summary.total_labels, 2);
        assert_eq!(summary.by_category.get("Fraud"), Some(&2));
        assert_eq!(summary.with_provenance, 1);
        assert_eq!(summary.in_clusters, 1);
    }

    #[test]
    fn test_flat_label_conversion() {
        let label = fraud_label("ANO001", FraudType::SelfApproval, "JE001", 15)
            .with_run_id("run-123")
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let flat = FlatAnomalyLabel::from(&label);

        assert_eq!(flat.anomaly_id, "ANO001");
        assert_eq!(flat.anomaly_category, "Fraud");
        assert_eq!(flat.run_id, Some("run-123".to_string()));
        assert_eq!(flat.causal_reason_type, Some("RandomRate".to_string()));
    }
}