Skip to main content

datasynth_runtime/
label_export.rs

1//! Anomaly label export functionality.
2//!
3//! This module provides functions for exporting anomaly labels to various formats
4//! (CSV, JSON, JSON Lines) for ML training and audit purposes.
5
6use datasynth_core::models::LabeledAnomaly;
7use serde::Serialize;
8use std::fs::File;
9use std::io::{BufWriter, Write};
10use std::path::Path;
11
12/// Error type for label export operations.
13#[derive(Debug, thiserror::Error)]
14pub enum LabelExportError {
15    #[error("IO error: {0}")]
16    Io(#[from] std::io::Error),
17    #[error("CSV error: {0}")]
18    Csv(#[from] csv::Error),
19    #[error("JSON error: {0}")]
20    Json(#[from] serde_json::Error),
21}
22
23/// Result type for label export operations.
24pub type LabelExportResult<T> = Result<T, LabelExportError>;
25
26/// Flattened anomaly label for CSV export.
27/// This structure converts nested fields to flat columns for easier CSV handling.
28#[derive(Debug, Serialize)]
29pub struct FlatAnomalyLabel {
30    // Core fields
31    pub anomaly_id: String,
32    pub anomaly_category: String,
33    pub anomaly_type: String,
34    pub document_id: String,
35    pub document_type: String,
36    pub company_code: String,
37    pub anomaly_date: String,
38    pub detection_timestamp: String,
39    pub confidence: f64,
40    pub severity: u8,
41    pub description: String,
42    pub is_injected: bool,
43    pub monetary_impact: Option<String>,
44    pub related_entities: String, // JSON array as string
45    pub cluster_id: Option<String>,
46
47    // Provenance fields
48    pub original_document_hash: Option<String>,
49    pub injection_strategy: Option<String>,
50    pub structured_strategy_type: Option<String>,
51    pub structured_strategy_json: Option<String>,
52    pub causal_reason_type: Option<String>,
53    pub causal_reason_json: Option<String>,
54    pub parent_anomaly_id: Option<String>,
55    pub child_anomaly_ids: String, // JSON array as string
56    pub scenario_id: Option<String>,
57    pub run_id: Option<String>,
58    pub generation_seed: Option<u64>,
59
60    // Metadata as JSON
61    pub metadata_json: String,
62}
63
64fn serialize_or_warn<T: serde::Serialize>(value: &T, field_name: &str) -> String {
65    serde_json::to_string(value).unwrap_or_else(|e| {
66        tracing::warn!("Failed to serialize {} for label export: {}", field_name, e);
67        String::new()
68    })
69}
70
71impl From<&LabeledAnomaly> for FlatAnomalyLabel {
72    fn from(label: &LabeledAnomaly) -> Self {
73        Self {
74            anomaly_id: label.anomaly_id.clone(),
75            anomaly_category: label.anomaly_type.category().to_string(),
76            anomaly_type: label.anomaly_type.type_name(),
77            document_id: label.document_id.clone(),
78            document_type: label.document_type.clone(),
79            company_code: label.company_code.clone(),
80            anomaly_date: label.anomaly_date.to_string(),
81            detection_timestamp: label.detection_timestamp.to_string(),
82            confidence: label.confidence,
83            severity: label.severity,
84            description: label.description.clone(),
85            is_injected: label.is_injected,
86            monetary_impact: label.monetary_impact.map(|d| d.to_string()),
87            related_entities: serialize_or_warn(&label.related_entities, "related_entities"),
88            cluster_id: label.cluster_id.clone(),
89
90            // Provenance fields
91            original_document_hash: label.original_document_hash.clone(),
92            injection_strategy: label.injection_strategy.clone(),
93            structured_strategy_type: label
94                .structured_strategy
95                .as_ref()
96                .map(|s| s.strategy_type().to_string()),
97            structured_strategy_json: label
98                .structured_strategy
99                .as_ref()
100                .map(|s| serialize_or_warn(s, "structured_strategy")),
101            causal_reason_type: label.causal_reason.as_ref().map(|r| match r {
102                datasynth_core::models::AnomalyCausalReason::RandomRate { .. } => {
103                    "RandomRate".to_string()
104                }
105                datasynth_core::models::AnomalyCausalReason::TemporalPattern { .. } => {
106                    "TemporalPattern".to_string()
107                }
108                datasynth_core::models::AnomalyCausalReason::EntityTargeting { .. } => {
109                    "EntityTargeting".to_string()
110                }
111                datasynth_core::models::AnomalyCausalReason::ClusterMembership { .. } => {
112                    "ClusterMembership".to_string()
113                }
114                datasynth_core::models::AnomalyCausalReason::ScenarioStep { .. } => {
115                    "ScenarioStep".to_string()
116                }
117                datasynth_core::models::AnomalyCausalReason::DataQualityProfile { .. } => {
118                    "DataQualityProfile".to_string()
119                }
120                datasynth_core::models::AnomalyCausalReason::MLTrainingBalance { .. } => {
121                    "MLTrainingBalance".to_string()
122                }
123            }),
124            causal_reason_json: label
125                .causal_reason
126                .as_ref()
127                .map(|r| serialize_or_warn(r, "causal_reason")),
128            parent_anomaly_id: label.parent_anomaly_id.clone(),
129            child_anomaly_ids: serialize_or_warn(&label.child_anomaly_ids, "child_anomaly_ids"),
130            scenario_id: label.scenario_id.clone(),
131            run_id: label.run_id.clone(),
132            generation_seed: label.generation_seed,
133
134            metadata_json: serialize_or_warn(&label.metadata, "metadata"),
135        }
136    }
137}
138
/// Options controlling how labels are exported.
#[derive(Debug, Clone)]
pub struct LabelExportConfig {
    /// Include every provenance field in the output.
    pub include_provenance: bool,
    /// Include the metadata JSON in the output.
    pub include_metadata: bool,
    /// Emit indented (pretty-printed) JSON rather than compact JSON.
    pub pretty_json: bool,
}

/// The default configuration turns every option on.
impl Default for LabelExportConfig {
    fn default() -> Self {
        LabelExportConfig {
            include_provenance: true,
            include_metadata: true,
            pretty_json: true,
        }
    }
}
159
160/// Exports anomaly labels to a CSV file.
161pub fn export_labels_csv(
162    labels: &[LabeledAnomaly],
163    path: &Path,
164    _config: &LabelExportConfig,
165) -> LabelExportResult<usize> {
166    let file = File::create(path)?;
167    let mut writer = csv::Writer::from_writer(BufWriter::with_capacity(256 * 1024, file));
168
169    for label in labels {
170        let flat: FlatAnomalyLabel = label.into();
171        writer.serialize(&flat)?;
172    }
173
174    writer.flush()?;
175    Ok(labels.len())
176}
177
178/// Exports anomaly labels to a JSON file (array format).
179pub fn export_labels_json(
180    labels: &[LabeledAnomaly],
181    path: &Path,
182    config: &LabelExportConfig,
183) -> LabelExportResult<usize> {
184    let file = File::create(path)?;
185    let writer = BufWriter::with_capacity(256 * 1024, file);
186
187    if config.pretty_json {
188        serde_json::to_writer_pretty(writer, labels)?;
189    } else {
190        serde_json::to_writer(writer, labels)?;
191    }
192
193    Ok(labels.len())
194}
195
196/// Exports anomaly labels to a JSON Lines file (one JSON object per line).
197pub fn export_labels_jsonl(
198    labels: &[LabeledAnomaly],
199    path: &Path,
200    _config: &LabelExportConfig,
201) -> LabelExportResult<usize> {
202    let file = File::create(path)?;
203    let mut writer = BufWriter::with_capacity(256 * 1024, file);
204
205    for label in labels {
206        let json = serde_json::to_string(label)?;
207        writeln!(writer, "{json}")?;
208    }
209
210    writer.flush()?;
211    Ok(labels.len())
212}
213
214/// Exports anomaly labels to multiple formats at once.
215pub fn export_labels_all_formats(
216    labels: &[LabeledAnomaly],
217    output_dir: &Path,
218    base_name: &str,
219    config: &LabelExportConfig,
220) -> LabelExportResult<Vec<(String, usize)>> {
221    std::fs::create_dir_all(output_dir)?;
222
223    let mut results = Vec::new();
224
225    // CSV
226    let csv_path = output_dir.join(format!("{base_name}.csv"));
227    let count = export_labels_csv(labels, &csv_path, config)?;
228    results.push((csv_path.display().to_string(), count));
229
230    // JSON
231    let json_path = output_dir.join(format!("{base_name}.json"));
232    let count = export_labels_json(labels, &json_path, config)?;
233    results.push((json_path.display().to_string(), count));
234
235    // JSONL
236    let jsonl_path = output_dir.join(format!("{base_name}.jsonl"));
237    let count = export_labels_jsonl(labels, &jsonl_path, config)?;
238    results.push((jsonl_path.display().to_string(), count));
239
240    Ok(results)
241}
242
243/// Summary statistics for exported labels.
244#[derive(Debug, Clone, Serialize)]
245pub struct LabelExportSummary {
246    /// Total labels exported.
247    pub total_labels: usize,
248    /// Labels by category.
249    pub by_category: std::collections::HashMap<String, usize>,
250    /// Labels by company.
251    pub by_company: std::collections::HashMap<String, usize>,
252    /// Labels with provenance.
253    pub with_provenance: usize,
254    /// Labels in scenarios.
255    pub in_scenarios: usize,
256    /// Labels in clusters.
257    pub in_clusters: usize,
258}
259
260impl LabelExportSummary {
261    /// Creates a summary from a list of labels.
262    pub fn from_labels(labels: &[LabeledAnomaly]) -> Self {
263        let mut by_category = std::collections::HashMap::new();
264        let mut by_company = std::collections::HashMap::new();
265        let mut with_provenance = 0;
266        let mut in_scenarios = 0;
267        let mut in_clusters = 0;
268
269        for label in labels {
270            *by_category
271                .entry(label.anomaly_type.category().to_string())
272                .or_insert(0) += 1;
273            *by_company.entry(label.company_code.clone()).or_insert(0) += 1;
274
275            if label.causal_reason.is_some() || label.structured_strategy.is_some() {
276                with_provenance += 1;
277            }
278            if label.scenario_id.is_some() {
279                in_scenarios += 1;
280            }
281            if label.cluster_id.is_some() {
282                in_clusters += 1;
283            }
284        }
285
286        Self {
287            total_labels: labels.len(),
288            by_category,
289            by_company,
290            with_provenance,
291            in_scenarios,
292            in_clusters,
293        }
294    }
295
296    /// Writes the summary to a JSON file.
297    pub fn write_to_file(&self, path: &Path) -> LabelExportResult<()> {
298        let file = File::create(path)?;
299        let writer = BufWriter::with_capacity(256 * 1024, file);
300        serde_json::to_writer_pretty(writer, self)?;
301        Ok(())
302    }
303}
304
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{AnomalyCausalReason, AnomalyType, FraudType};
    use tempfile::TempDir;

    /// Two-label fixture: the first label has run/seed/causal-reason
    /// provenance, the second only belongs to a cluster.
    fn create_test_labels() -> Vec<LabeledAnomaly> {
        let with_provenance = LabeledAnomaly::new(
            "ANO001".to_string(),
            AnomalyType::Fraud(FraudType::SelfApproval),
            "JE001".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
        )
        .with_run_id("run-123")
        .with_generation_seed(42)
        .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let clustered = LabeledAnomaly::new(
            "ANO002".to_string(),
            AnomalyType::Fraud(FraudType::DuplicatePayment),
            "JE002".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            NaiveDate::from_ymd_opt(2024, 1, 16).unwrap(),
        )
        .with_cluster("cluster-1");

        vec![with_provenance, clustered]
    }

    #[test]
    fn test_export_csv() {
        let dir = TempDir::new().unwrap();
        let fixture = create_test_labels();
        let target = dir.path().join("labels.csv");

        let written =
            export_labels_csv(&fixture, &target, &LabelExportConfig::default()).unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("SelfApproval"));
    }

    #[test]
    fn test_export_json() {
        let dir = TempDir::new().unwrap();
        let fixture = create_test_labels();
        let target = dir.path().join("labels.json");

        let written =
            export_labels_json(&fixture, &target, &LabelExportConfig::default()).unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("run-123"));
    }

    #[test]
    fn test_export_jsonl() {
        let dir = TempDir::new().unwrap();
        let fixture = create_test_labels();
        let target = dir.path().join("labels.jsonl");

        let written =
            export_labels_jsonl(&fixture, &target, &LabelExportConfig::default()).unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        // One line per label.
        let text = std::fs::read_to_string(&target).unwrap();
        assert_eq!(text.lines().count(), 2);
    }

    #[test]
    fn test_export_all_formats() {
        let dir = TempDir::new().unwrap();
        let fixture = create_test_labels();

        let outputs = export_labels_all_formats(
            &fixture,
            dir.path(),
            "anomaly_labels",
            &LabelExportConfig::default(),
        )
        .unwrap();

        assert_eq!(outputs.len(), 3);
        for ext in ["csv", "json", "jsonl"] {
            assert!(dir.path().join(format!("anomaly_labels.{ext}")).exists());
        }
    }

    #[test]
    fn test_label_export_summary() {
        let summary = LabelExportSummary::from_labels(&create_test_labels());

        assert_eq!(summary.total_labels, 2);
        assert_eq!(summary.by_category.get("Fraud"), Some(&2));
        assert_eq!(summary.with_provenance, 1);
        assert_eq!(summary.in_clusters, 1);
    }

    #[test]
    fn test_flat_label_conversion() {
        let source = LabeledAnomaly::new(
            "ANO001".to_string(),
            AnomalyType::Fraud(FraudType::SelfApproval),
            "JE001".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
        )
        .with_run_id("run-123")
        .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let flat = FlatAnomalyLabel::from(&source);

        assert_eq!(flat.anomaly_id, "ANO001");
        assert_eq!(flat.anomaly_category, "Fraud");
        assert_eq!(flat.run_id.as_deref(), Some("run-123"));
        assert_eq!(flat.causal_reason_type.as_deref(), Some("RandomRate"));
    }
}