1use datasynth_core::models::LabeledAnomaly;
7use serde::Serialize;
8use std::fs::File;
9use std::io::{BufWriter, Write};
10use std::path::Path;
11
/// Errors that can occur while exporting anomaly labels.
///
/// Each variant wraps the underlying library error; the `#[from]` attributes
/// let the `?` operator convert those errors into this type.
#[derive(Debug, thiserror::Error)]
pub enum LabelExportError {
    /// Filesystem or stream failure (creating, writing, or flushing a file).
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// Failure reported by the `csv` writer.
    #[error("CSV error: {0}")]
    Csv(#[from] csv::Error),
    /// Failure reported by `serde_json`.
    #[error("JSON error: {0}")]
    Json(#[from] serde_json::Error),
}
22
/// Result alias used by every export function in this module.
pub type LabelExportResult<T> = Result<T, LabelExportError>;
25
/// Flattened, single-level view of a [`LabeledAnomaly`] used for CSV export.
///
/// Nested values are stored as strings (JSON text or `to_string()` output) so
/// that each label fits in one flat record. See the `From<&LabeledAnomaly>`
/// implementation for how every field is derived.
#[derive(Debug, Serialize)]
pub struct FlatAnomalyLabel {
    /// Identifier of the anomaly, copied from the label.
    pub anomaly_id: String,
    /// Result of `anomaly_type.category()` rendered as a string.
    pub anomaly_category: String,
    /// Result of `anomaly_type.type_name()`.
    pub anomaly_type: String,
    /// Identifier of the affected document.
    pub document_id: String,
    /// Type of the affected document.
    pub document_type: String,
    /// Company code copied from the label.
    pub company_code: String,
    /// `anomaly_date` rendered with `to_string()`.
    pub anomaly_date: String,
    /// `detection_timestamp` rendered with `to_string()`.
    pub detection_timestamp: String,
    /// Confidence value copied from the label.
    pub confidence: f64,
    /// Severity value copied from the label.
    pub severity: u8,
    /// `observability.as_str()` of the label.
    pub observability: String,
    /// Free-text description copied from the label.
    pub description: String,
    /// Injection flag copied from the label.
    pub is_injected: bool,
    /// Monetary impact rendered with `to_string()`, if present.
    pub monetary_impact: Option<String>,
    /// JSON text of the label's `related_entities` (empty string if serialization failed).
    pub related_entities: String,
    /// Cluster identifier, if the label has one.
    pub cluster_id: Option<String>,

    /// Hash of the original document, if recorded on the label.
    pub original_document_hash: Option<String>,
    /// Injection strategy string copied from the label, if present.
    pub injection_strategy: Option<String>,
    /// `strategy_type()` of the structured strategy, if present.
    pub structured_strategy_type: Option<String>,
    /// JSON text of the structured strategy, if present.
    pub structured_strategy_json: Option<String>,
    /// Variant name of the causal reason (e.g. `"RandomRate"`), if present.
    pub causal_reason_type: Option<String>,
    /// JSON text of the causal reason, if present.
    pub causal_reason_json: Option<String>,
    /// Identifier of the parent anomaly, if present.
    pub parent_anomaly_id: Option<String>,
    /// JSON text of the label's `child_anomaly_ids` (empty string if serialization failed).
    pub child_anomaly_ids: String,
    /// Scenario identifier, if present.
    pub scenario_id: Option<String>,
    /// Run identifier, if present.
    pub run_id: Option<String>,
    /// Generation seed, if present.
    pub generation_seed: Option<u64>,

    /// JSON text of the label's `metadata` (empty string if serialization failed).
    pub metadata_json: String,
}
66
67fn serialize_or_warn<T: serde::Serialize>(value: &T, field_name: &str) -> String {
68 serde_json::to_string(value).unwrap_or_else(|e| {
69 tracing::warn!("Failed to serialize {} for label export: {}", field_name, e);
70 String::new()
71 })
72}
73
74impl From<&LabeledAnomaly> for FlatAnomalyLabel {
75 fn from(label: &LabeledAnomaly) -> Self {
76 Self {
77 anomaly_id: label.anomaly_id.clone(),
78 anomaly_category: label.anomaly_type.category().to_string(),
79 anomaly_type: label.anomaly_type.type_name(),
80 document_id: label.document_id.clone(),
81 document_type: label.document_type.clone(),
82 company_code: label.company_code.clone(),
83 anomaly_date: label.anomaly_date.to_string(),
84 detection_timestamp: label.detection_timestamp.to_string(),
85 confidence: label.confidence,
86 severity: label.severity,
87 observability: label.observability.as_str().to_string(),
88 description: label.description.clone(),
89 is_injected: label.is_injected,
90 monetary_impact: label.monetary_impact.map(|d| d.to_string()),
91 related_entities: serialize_or_warn(&label.related_entities, "related_entities"),
92 cluster_id: label.cluster_id.clone(),
93
94 original_document_hash: label.original_document_hash.clone(),
96 injection_strategy: label.injection_strategy.clone(),
97 structured_strategy_type: label
98 .structured_strategy
99 .as_ref()
100 .map(|s| s.strategy_type().to_string()),
101 structured_strategy_json: label
102 .structured_strategy
103 .as_ref()
104 .map(|s| serialize_or_warn(s, "structured_strategy")),
105 causal_reason_type: label.causal_reason.as_ref().map(|r| match r {
106 datasynth_core::models::AnomalyCausalReason::RandomRate { .. } => {
107 "RandomRate".to_string()
108 }
109 datasynth_core::models::AnomalyCausalReason::TemporalPattern { .. } => {
110 "TemporalPattern".to_string()
111 }
112 datasynth_core::models::AnomalyCausalReason::EntityTargeting { .. } => {
113 "EntityTargeting".to_string()
114 }
115 datasynth_core::models::AnomalyCausalReason::ClusterMembership { .. } => {
116 "ClusterMembership".to_string()
117 }
118 datasynth_core::models::AnomalyCausalReason::ScenarioStep { .. } => {
119 "ScenarioStep".to_string()
120 }
121 datasynth_core::models::AnomalyCausalReason::DataQualityProfile { .. } => {
122 "DataQualityProfile".to_string()
123 }
124 datasynth_core::models::AnomalyCausalReason::MLTrainingBalance { .. } => {
125 "MLTrainingBalance".to_string()
126 }
127 }),
128 causal_reason_json: label
129 .causal_reason
130 .as_ref()
131 .map(|r| serialize_or_warn(r, "causal_reason")),
132 parent_anomaly_id: label.parent_anomaly_id.clone(),
133 child_anomaly_ids: serialize_or_warn(&label.child_anomaly_ids, "child_anomaly_ids"),
134 scenario_id: label.scenario_id.clone(),
135 run_id: label.run_id.clone(),
136 generation_seed: label.generation_seed,
137
138 metadata_json: serialize_or_warn(&label.metadata, "metadata"),
139 }
140 }
141}
142
/// Options accepted by the label export functions.
///
/// NOTE(review): among the exporters in this file only `pretty_json` is
/// actually read (by `export_labels_json`); the other two flags are accepted
/// but not consulted here — confirm whether they are honoured elsewhere.
#[derive(Debug, Clone)]
pub struct LabelExportConfig {
    /// Whether provenance fields should be included in the export.
    pub include_provenance: bool,
    /// Whether metadata should be included in the export.
    pub include_metadata: bool,
    /// Whether JSON output should be pretty-printed.
    pub pretty_json: bool,
}

impl Default for LabelExportConfig {
    /// Returns a configuration with every option switched on.
    fn default() -> Self {
        LabelExportConfig {
            pretty_json: true,
            include_metadata: true,
            include_provenance: true,
        }
    }
}
163
164pub fn export_labels_csv(
166 labels: &[LabeledAnomaly],
167 path: &Path,
168 _config: &LabelExportConfig,
169) -> LabelExportResult<usize> {
170 let file = File::create(path)?;
171 let mut writer = csv::Writer::from_writer(BufWriter::with_capacity(256 * 1024, file));
172
173 for label in labels {
174 let flat: FlatAnomalyLabel = label.into();
175 writer.serialize(&flat)?;
176 }
177
178 writer.flush()?;
179 Ok(labels.len())
180}
181
182pub fn export_labels_json(
184 labels: &[LabeledAnomaly],
185 path: &Path,
186 config: &LabelExportConfig,
187) -> LabelExportResult<usize> {
188 let file = File::create(path)?;
189 let writer = BufWriter::with_capacity(256 * 1024, file);
190
191 if config.pretty_json {
192 serde_json::to_writer_pretty(writer, labels)?;
193 } else {
194 serde_json::to_writer(writer, labels)?;
195 }
196
197 Ok(labels.len())
198}
199
200pub fn export_labels_jsonl(
202 labels: &[LabeledAnomaly],
203 path: &Path,
204 _config: &LabelExportConfig,
205) -> LabelExportResult<usize> {
206 let file = File::create(path)?;
207 let mut writer = BufWriter::with_capacity(256 * 1024, file);
208
209 for label in labels {
210 let json = serde_json::to_string(label)?;
211 writeln!(writer, "{json}")?;
212 }
213
214 writer.flush()?;
215 Ok(labels.len())
216}
217
218pub fn export_labels_all_formats(
220 labels: &[LabeledAnomaly],
221 output_dir: &Path,
222 base_name: &str,
223 config: &LabelExportConfig,
224) -> LabelExportResult<Vec<(String, usize)>> {
225 std::fs::create_dir_all(output_dir)?;
226
227 let mut results = Vec::new();
228
229 let csv_path = output_dir.join(format!("{base_name}.csv"));
231 let count = export_labels_csv(labels, &csv_path, config)?;
232 results.push((csv_path.display().to_string(), count));
233
234 let json_path = output_dir.join(format!("{base_name}.json"));
236 let count = export_labels_json(labels, &json_path, config)?;
237 results.push((json_path.display().to_string(), count));
238
239 let jsonl_path = output_dir.join(format!("{base_name}.jsonl"));
241 let count = export_labels_jsonl(labels, &jsonl_path, config)?;
242 results.push((jsonl_path.display().to_string(), count));
243
244 Ok(results)
245}
246
/// Aggregate counts over a set of anomaly labels.
///
/// Built by `LabelExportSummary::from_labels`.
#[derive(Debug, Clone, Serialize)]
pub struct LabelExportSummary {
    /// Total number of labels summarized.
    pub total_labels: usize,
    /// Label count keyed by `anomaly_type.category()` rendered as a string.
    pub by_category: std::collections::HashMap<String, usize>,
    /// Label count keyed by company code.
    pub by_company: std::collections::HashMap<String, usize>,
    /// Number of labels that carry a causal reason or a structured strategy.
    pub with_provenance: usize,
    /// Number of labels that have a scenario id.
    pub in_scenarios: usize,
    /// Number of labels that have a cluster id.
    pub in_clusters: usize,
}
263
264impl LabelExportSummary {
265 pub fn from_labels(labels: &[LabeledAnomaly]) -> Self {
267 let mut by_category = std::collections::HashMap::new();
268 let mut by_company = std::collections::HashMap::new();
269 let mut with_provenance = 0;
270 let mut in_scenarios = 0;
271 let mut in_clusters = 0;
272
273 for label in labels {
274 *by_category
275 .entry(label.anomaly_type.category().to_string())
276 .or_insert(0) += 1;
277 *by_company.entry(label.company_code.clone()).or_insert(0) += 1;
278
279 if label.causal_reason.is_some() || label.structured_strategy.is_some() {
280 with_provenance += 1;
281 }
282 if label.scenario_id.is_some() {
283 in_scenarios += 1;
284 }
285 if label.cluster_id.is_some() {
286 in_clusters += 1;
287 }
288 }
289
290 Self {
291 total_labels: labels.len(),
292 by_category,
293 by_company,
294 with_provenance,
295 in_scenarios,
296 in_clusters,
297 }
298 }
299
300 pub fn write_to_file(&self, path: &Path) -> LabelExportResult<()> {
302 let file = File::create(path)?;
303 let writer = BufWriter::with_capacity(256 * 1024, file);
304 serde_json::to_writer_pretty(writer, self)?;
305 Ok(())
306 }
307}
308
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{AnomalyCausalReason, AnomalyType, FraudType};
    use tempfile::TempDir;

    /// Returns the given day of January 2024.
    fn jan_2024(day: u32) -> NaiveDate {
        NaiveDate::from_ymd_opt(2024, 1, day).unwrap()
    }

    /// Two fraud labels: the first with run id, seed, and causal reason; the
    /// second assigned to a cluster.
    fn create_test_labels() -> Vec<LabeledAnomaly> {
        let self_approval = LabeledAnomaly::new(
            "ANO001".to_string(),
            AnomalyType::Fraud(FraudType::SelfApproval),
            "JE001".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            jan_2024(15),
        )
        .with_run_id("run-123")
        .with_generation_seed(42)
        .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let duplicate_payment = LabeledAnomaly::new(
            "ANO002".to_string(),
            AnomalyType::Fraud(FraudType::DuplicatePayment),
            "JE002".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            jan_2024(16),
        )
        .with_cluster("cluster-1");

        vec![self_approval, duplicate_payment]
    }

    #[test]
    fn test_export_csv() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.csv");

        let written =
            export_labels_csv(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("SelfApproval"));
    }

    #[test]
    fn test_export_json() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.json");

        let written =
            export_labels_json(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("run-123"));
    }

    #[test]
    fn test_export_jsonl() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.jsonl");

        let written =
            export_labels_jsonl(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        // One line per label.
        let text = std::fs::read_to_string(&target).unwrap();
        assert_eq!(text.lines().count(), 2);
    }

    #[test]
    fn test_export_all_formats() {
        let dir = TempDir::new().unwrap();
        let fixtures = create_test_labels();
        let options = LabelExportConfig::default();

        let outputs =
            export_labels_all_formats(&fixtures, dir.path(), "anomaly_labels", &options).unwrap();

        assert_eq!(outputs.len(), 3);
        for file_name in [
            "anomaly_labels.csv",
            "anomaly_labels.json",
            "anomaly_labels.jsonl",
        ] {
            assert!(dir.path().join(file_name).exists());
        }
    }

    #[test]
    fn test_label_export_summary() {
        let stats = LabelExportSummary::from_labels(&create_test_labels());

        assert_eq!(stats.total_labels, 2);
        assert_eq!(stats.by_category.get("Fraud"), Some(&2));
        assert_eq!(stats.with_provenance, 1);
        assert_eq!(stats.in_clusters, 1);
    }

    #[test]
    fn test_flat_label_conversion() {
        let source = LabeledAnomaly::new(
            "ANO001".to_string(),
            AnomalyType::Fraud(FraudType::SelfApproval),
            "JE001".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            jan_2024(15),
        )
        .with_run_id("run-123")
        .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let row = FlatAnomalyLabel::from(&source);

        assert_eq!(row.anomaly_id, "ANO001");
        assert_eq!(row.anomaly_category, "Fraud");
        assert_eq!(row.run_id, Some("run-123".to_string()));
        assert_eq!(row.causal_reason_type, Some("RandomRate".to_string()));
    }
}