Skip to main content

datasynth_runtime/
label_export.rs

1//! Anomaly label export functionality.
2//!
3//! This module provides functions for exporting anomaly labels to various formats
4//! (CSV, JSON, JSON Lines) for ML training and audit purposes.
5
6use datasynth_core::models::LabeledAnomaly;
7use serde::Serialize;
8use std::fs::File;
9use std::io::{BufWriter, Write};
10use std::path::Path;
11
/// Error type for label export operations.
///
/// Each variant wraps the error of one underlying library. The `#[from]`
/// attributes let the `?` operator convert those errors into this type.
#[derive(Debug, thiserror::Error)]
pub enum LabelExportError {
    /// Wraps a `std::io::Error`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// Wraps a `csv::Error`.
    #[error("CSV error: {0}")]
    Csv(#[from] csv::Error),
    /// Wraps a `serde_json::Error`.
    #[error("JSON error: {0}")]
    Json(#[from] serde_json::Error),
}
22
/// Result type for label export operations.
///
/// Shorthand for `Result<T, LabelExportError>`.
pub type LabelExportResult<T> = Result<T, LabelExportError>;
25
/// Flattened anomaly label for CSV export.
/// This structure converts nested fields to flat columns for easier CSV handling.
///
/// Values are produced by the `From<&LabeledAnomaly>` implementation in this
/// module: plain fields are copied, dates and amounts are rendered with
/// `to_string()`, and nested values are embedded as JSON text.
#[derive(Debug, Serialize)]
pub struct FlatAnomalyLabel {
    // Core fields
    pub anomaly_id: String,
    /// Text form of `anomaly_type.category()`.
    pub anomaly_category: String,
    /// Result of `anomaly_type.type_name()`.
    pub anomaly_type: String,
    pub document_id: String,
    pub document_type: String,
    pub company_code: String,
    /// `anomaly_date` rendered with `to_string()`.
    pub anomaly_date: String,
    /// `detection_timestamp` rendered with `to_string()`.
    pub detection_timestamp: String,
    pub confidence: f64,
    pub severity: u8,
    pub description: String,
    pub is_injected: bool,
    /// Amount rendered with `to_string()`; `None` when the label has no amount.
    pub monetary_impact: Option<String>,
    pub related_entities: String, // JSON array as string
    pub cluster_id: Option<String>,

    // Provenance fields
    pub original_document_hash: Option<String>,
    pub injection_strategy: Option<String>,
    /// Text form of `strategy_type()` of the structured strategy, if one is set.
    pub structured_strategy_type: Option<String>,
    /// The structured strategy serialized as JSON, if one is set.
    pub structured_strategy_json: Option<String>,
    /// Variant name of the causal reason (e.g. "RandomRate"), if one is set.
    pub causal_reason_type: Option<String>,
    /// The causal reason serialized as JSON, if one is set.
    pub causal_reason_json: Option<String>,
    pub parent_anomaly_id: Option<String>,
    pub child_anomaly_ids: String, // JSON array as string
    pub scenario_id: Option<String>,
    pub run_id: Option<String>,
    pub generation_seed: Option<u64>,

    // Metadata as JSON
    /// The label's `metadata` value serialized as JSON text.
    pub metadata_json: String,
}
63
64fn serialize_or_warn<T: serde::Serialize>(value: &T, field_name: &str) -> String {
65    serde_json::to_string(value).unwrap_or_else(|e| {
66        tracing::warn!("Failed to serialize {} for label export: {}", field_name, e);
67        String::new()
68    })
69}
70
71impl From<&LabeledAnomaly> for FlatAnomalyLabel {
72    fn from(label: &LabeledAnomaly) -> Self {
73        Self {
74            anomaly_id: label.anomaly_id.clone(),
75            anomaly_category: label.anomaly_type.category().to_string(),
76            anomaly_type: label.anomaly_type.type_name(),
77            document_id: label.document_id.clone(),
78            document_type: label.document_type.clone(),
79            company_code: label.company_code.clone(),
80            anomaly_date: label.anomaly_date.to_string(),
81            detection_timestamp: label.detection_timestamp.to_string(),
82            confidence: label.confidence,
83            severity: label.severity,
84            description: label.description.clone(),
85            is_injected: label.is_injected,
86            monetary_impact: label.monetary_impact.map(|d| d.to_string()),
87            related_entities: serialize_or_warn(&label.related_entities, "related_entities"),
88            cluster_id: label.cluster_id.clone(),
89
90            // Provenance fields
91            original_document_hash: label.original_document_hash.clone(),
92            injection_strategy: label.injection_strategy.clone(),
93            structured_strategy_type: label
94                .structured_strategy
95                .as_ref()
96                .map(|s| s.strategy_type().to_string()),
97            structured_strategy_json: label
98                .structured_strategy
99                .as_ref()
100                .map(|s| serialize_or_warn(s, "structured_strategy")),
101            causal_reason_type: label.causal_reason.as_ref().map(|r| match r {
102                datasynth_core::models::AnomalyCausalReason::RandomRate { .. } => {
103                    "RandomRate".to_string()
104                }
105                datasynth_core::models::AnomalyCausalReason::TemporalPattern { .. } => {
106                    "TemporalPattern".to_string()
107                }
108                datasynth_core::models::AnomalyCausalReason::EntityTargeting { .. } => {
109                    "EntityTargeting".to_string()
110                }
111                datasynth_core::models::AnomalyCausalReason::ClusterMembership { .. } => {
112                    "ClusterMembership".to_string()
113                }
114                datasynth_core::models::AnomalyCausalReason::ScenarioStep { .. } => {
115                    "ScenarioStep".to_string()
116                }
117                datasynth_core::models::AnomalyCausalReason::DataQualityProfile { .. } => {
118                    "DataQualityProfile".to_string()
119                }
120                datasynth_core::models::AnomalyCausalReason::MLTrainingBalance { .. } => {
121                    "MLTrainingBalance".to_string()
122                }
123            }),
124            causal_reason_json: label
125                .causal_reason
126                .as_ref()
127                .map(|r| serialize_or_warn(r, "causal_reason")),
128            parent_anomaly_id: label.parent_anomaly_id.clone(),
129            child_anomaly_ids: serialize_or_warn(&label.child_anomaly_ids, "child_anomaly_ids"),
130            scenario_id: label.scenario_id.clone(),
131            run_id: label.run_id.clone(),
132            generation_seed: label.generation_seed,
133
134            metadata_json: serialize_or_warn(&label.metadata, "metadata"),
135        }
136    }
137}
138
/// Configuration for label export.
///
/// NOTE(review): within this file only `pretty_json` is read (by
/// `export_labels_json`). The CSV and JSON Lines exporters accept the config
/// as `_config` and do not consult `include_provenance` or `include_metadata`.
#[derive(Debug, Clone)]
pub struct LabelExportConfig {
    /// Whether to include all provenance fields.
    pub include_provenance: bool,
    /// Whether to include metadata JSON.
    pub include_metadata: bool,
    /// Whether to pretty-print JSON output.
    pub pretty_json: bool,
}
149
150impl Default for LabelExportConfig {
151    fn default() -> Self {
152        Self {
153            include_provenance: true,
154            include_metadata: true,
155            pretty_json: true,
156        }
157    }
158}
159
160/// Exports anomaly labels to a CSV file.
161pub fn export_labels_csv(
162    labels: &[LabeledAnomaly],
163    path: &Path,
164    _config: &LabelExportConfig,
165) -> LabelExportResult<usize> {
166    let file = File::create(path)?;
167    let mut writer = csv::Writer::from_writer(BufWriter::with_capacity(256 * 1024, file));
168
169    for label in labels {
170        let flat: FlatAnomalyLabel = label.into();
171        writer.serialize(&flat)?;
172    }
173
174    writer.flush()?;
175    Ok(labels.len())
176}
177
178/// Exports anomaly labels to a JSON file (array format).
179pub fn export_labels_json(
180    labels: &[LabeledAnomaly],
181    path: &Path,
182    config: &LabelExportConfig,
183) -> LabelExportResult<usize> {
184    let file = File::create(path)?;
185    let writer = BufWriter::with_capacity(256 * 1024, file);
186
187    if config.pretty_json {
188        serde_json::to_writer_pretty(writer, labels)?;
189    } else {
190        serde_json::to_writer(writer, labels)?;
191    }
192
193    Ok(labels.len())
194}
195
196/// Exports anomaly labels to a JSON Lines file (one JSON object per line).
197pub fn export_labels_jsonl(
198    labels: &[LabeledAnomaly],
199    path: &Path,
200    _config: &LabelExportConfig,
201) -> LabelExportResult<usize> {
202    let file = File::create(path)?;
203    let mut writer = BufWriter::with_capacity(256 * 1024, file);
204
205    for label in labels {
206        let json = serde_json::to_string(label)?;
207        writeln!(writer, "{}", json)?;
208    }
209
210    writer.flush()?;
211    Ok(labels.len())
212}
213
214/// Exports anomaly labels to multiple formats at once.
215pub fn export_labels_all_formats(
216    labels: &[LabeledAnomaly],
217    output_dir: &Path,
218    base_name: &str,
219    config: &LabelExportConfig,
220) -> LabelExportResult<Vec<(String, usize)>> {
221    std::fs::create_dir_all(output_dir)?;
222
223    let mut results = Vec::new();
224
225    // CSV
226    let csv_path = output_dir.join(format!("{}.csv", base_name));
227    let count = export_labels_csv(labels, &csv_path, config)?;
228    results.push((csv_path.display().to_string(), count));
229
230    // JSON
231    let json_path = output_dir.join(format!("{}.json", base_name));
232    let count = export_labels_json(labels, &json_path, config)?;
233    results.push((json_path.display().to_string(), count));
234
235    // JSONL
236    let jsonl_path = output_dir.join(format!("{}.jsonl", base_name));
237    let count = export_labels_jsonl(labels, &jsonl_path, config)?;
238    results.push((jsonl_path.display().to_string(), count));
239
240    Ok(results)
241}
242
/// Summary statistics for exported labels.
///
/// Built by `LabelExportSummary::from_labels`; all counts refer to the slice
/// of labels passed to that function.
#[derive(Debug, Clone, Serialize)]
pub struct LabelExportSummary {
    /// Total labels exported.
    pub total_labels: usize,
    /// Labels by category.
    /// Keys are the text form of `anomaly_type.category()`.
    pub by_category: std::collections::HashMap<String, usize>,
    /// Labels by company.
    /// Keys are the labels' `company_code` values.
    pub by_company: std::collections::HashMap<String, usize>,
    /// Labels with provenance.
    /// A label counts when it has a causal reason or a structured strategy.
    pub with_provenance: usize,
    /// Labels in scenarios.
    /// A label counts when its `scenario_id` is set.
    pub in_scenarios: usize,
    /// Labels in clusters.
    /// A label counts when its `cluster_id` is set.
    pub in_clusters: usize,
}
259
260impl LabelExportSummary {
261    /// Creates a summary from a list of labels.
262    pub fn from_labels(labels: &[LabeledAnomaly]) -> Self {
263        let mut by_category = std::collections::HashMap::new();
264        let mut by_company = std::collections::HashMap::new();
265        let mut with_provenance = 0;
266        let mut in_scenarios = 0;
267        let mut in_clusters = 0;
268
269        for label in labels {
270            *by_category
271                .entry(label.anomaly_type.category().to_string())
272                .or_insert(0) += 1;
273            *by_company.entry(label.company_code.clone()).or_insert(0) += 1;
274
275            if label.causal_reason.is_some() || label.structured_strategy.is_some() {
276                with_provenance += 1;
277            }
278            if label.scenario_id.is_some() {
279                in_scenarios += 1;
280            }
281            if label.cluster_id.is_some() {
282                in_clusters += 1;
283            }
284        }
285
286        Self {
287            total_labels: labels.len(),
288            by_category,
289            by_company,
290            with_provenance,
291            in_scenarios,
292            in_clusters,
293        }
294    }
295
296    /// Writes the summary to a JSON file.
297    pub fn write_to_file(&self, path: &Path) -> LabelExportResult<()> {
298        let file = File::create(path)?;
299        let writer = BufWriter::with_capacity(256 * 1024, file);
300        serde_json::to_writer_pretty(writer, self)?;
301        Ok(())
302    }
303}
304
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{AnomalyCausalReason, AnomalyType, FraudType};
    use tempfile::TempDir;

    /// Builds a fraud label with document type "JE" and company code "1000",
    /// dated 2024-01-`day`.
    fn fraud_label(id: &str, fraud: FraudType, document_id: &str, day: u32) -> LabeledAnomaly {
        LabeledAnomaly::new(
            id.to_string(),
            AnomalyType::Fraud(fraud),
            document_id.to_string(),
            "JE".to_string(),
            "1000".to_string(),
            NaiveDate::from_ymd_opt(2024, 1, day).unwrap(),
        )
    }

    /// Two labels: one carrying run id, seed and causal reason, one in a cluster.
    fn create_test_labels() -> Vec<LabeledAnomaly> {
        let with_provenance = fraud_label("ANO001", FraudType::SelfApproval, "JE001", 15)
            .with_run_id("run-123")
            .with_generation_seed(42)
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });
        let clustered = fraud_label("ANO002", FraudType::DuplicatePayment, "JE002", 16)
            .with_cluster("cluster-1");

        vec![with_provenance, clustered]
    }

    #[test]
    fn test_export_csv() {
        let scratch = TempDir::new().unwrap();
        let target = scratch.path().join("labels.csv");

        let written =
            export_labels_csv(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("SelfApproval"));
    }

    #[test]
    fn test_export_json() {
        let scratch = TempDir::new().unwrap();
        let target = scratch.path().join("labels.json");

        let written =
            export_labels_json(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("run-123"));
    }

    #[test]
    fn test_export_jsonl() {
        let scratch = TempDir::new().unwrap();
        let target = scratch.path().join("labels.jsonl");

        let written =
            export_labels_jsonl(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        // One line per label.
        let text = std::fs::read_to_string(&target).unwrap();
        assert_eq!(text.lines().count(), 2);
    }

    #[test]
    fn test_export_all_formats() {
        let scratch = TempDir::new().unwrap();

        let outputs = export_labels_all_formats(
            &create_test_labels(),
            scratch.path(),
            "anomaly_labels",
            &LabelExportConfig::default(),
        )
        .unwrap();

        assert_eq!(outputs.len(), 3);
        for file_name in [
            "anomaly_labels.csv",
            "anomaly_labels.json",
            "anomaly_labels.jsonl",
        ] {
            assert!(scratch.path().join(file_name).exists());
        }
    }

    #[test]
    fn test_label_export_summary() {
        let stats = LabelExportSummary::from_labels(&create_test_labels());

        assert_eq!(stats.total_labels, 2);
        assert_eq!(stats.by_category.get("Fraud"), Some(&2));
        assert_eq!(stats.with_provenance, 1);
        assert_eq!(stats.in_clusters, 1);
    }

    #[test]
    fn test_flat_label_conversion() {
        let source = fraud_label("ANO001", FraudType::SelfApproval, "JE001", 15)
            .with_run_id("run-123")
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let row = FlatAnomalyLabel::from(&source);

        assert_eq!(row.anomaly_id, "ANO001");
        assert_eq!(row.anomaly_category, "Fraud");
        assert_eq!(row.run_id, Some("run-123".to_string()));
        assert_eq!(row.causal_reason_type, Some("RandomRate".to_string()));
    }
}