1use datasynth_core::models::LabeledAnomaly;
7use serde::Serialize;
8use std::fs::File;
9use std::io::{BufWriter, Write};
10use std::path::Path;
11
12#[derive(Debug, thiserror::Error)]
14pub enum LabelExportError {
15 #[error("IO error: {0}")]
16 Io(#[from] std::io::Error),
17 #[error("CSV error: {0}")]
18 Csv(#[from] csv::Error),
19 #[error("JSON error: {0}")]
20 Json(#[from] serde_json::Error),
21}
22
23pub type LabelExportResult<T> = Result<T, LabelExportError>;
25
26#[derive(Debug, Serialize)]
29pub struct FlatAnomalyLabel {
30 pub anomaly_id: String,
32 pub anomaly_category: String,
33 pub anomaly_type: String,
34 pub document_id: String,
35 pub document_type: String,
36 pub company_code: String,
37 pub anomaly_date: String,
38 pub detection_timestamp: String,
39 pub confidence: f64,
40 pub severity: u8,
41 pub description: String,
42 pub is_injected: bool,
43 pub monetary_impact: Option<String>,
44 pub related_entities: String, pub cluster_id: Option<String>,
46
47 pub original_document_hash: Option<String>,
49 pub injection_strategy: Option<String>,
50 pub structured_strategy_type: Option<String>,
51 pub structured_strategy_json: Option<String>,
52 pub causal_reason_type: Option<String>,
53 pub causal_reason_json: Option<String>,
54 pub parent_anomaly_id: Option<String>,
55 pub child_anomaly_ids: String, pub scenario_id: Option<String>,
57 pub run_id: Option<String>,
58 pub generation_seed: Option<u64>,
59
60 pub metadata_json: String,
62}
63
64fn serialize_or_warn<T: serde::Serialize>(value: &T, field_name: &str) -> String {
65 serde_json::to_string(value).unwrap_or_else(|e| {
66 tracing::warn!("Failed to serialize {} for label export: {}", field_name, e);
67 String::new()
68 })
69}
70
71impl From<&LabeledAnomaly> for FlatAnomalyLabel {
72 fn from(label: &LabeledAnomaly) -> Self {
73 Self {
74 anomaly_id: label.anomaly_id.clone(),
75 anomaly_category: label.anomaly_type.category().to_string(),
76 anomaly_type: label.anomaly_type.type_name(),
77 document_id: label.document_id.clone(),
78 document_type: label.document_type.clone(),
79 company_code: label.company_code.clone(),
80 anomaly_date: label.anomaly_date.to_string(),
81 detection_timestamp: label.detection_timestamp.to_string(),
82 confidence: label.confidence,
83 severity: label.severity,
84 description: label.description.clone(),
85 is_injected: label.is_injected,
86 monetary_impact: label.monetary_impact.map(|d| d.to_string()),
87 related_entities: serialize_or_warn(&label.related_entities, "related_entities"),
88 cluster_id: label.cluster_id.clone(),
89
90 original_document_hash: label.original_document_hash.clone(),
92 injection_strategy: label.injection_strategy.clone(),
93 structured_strategy_type: label
94 .structured_strategy
95 .as_ref()
96 .map(|s| s.strategy_type().to_string()),
97 structured_strategy_json: label
98 .structured_strategy
99 .as_ref()
100 .map(|s| serialize_or_warn(s, "structured_strategy")),
101 causal_reason_type: label.causal_reason.as_ref().map(|r| match r {
102 datasynth_core::models::AnomalyCausalReason::RandomRate { .. } => {
103 "RandomRate".to_string()
104 }
105 datasynth_core::models::AnomalyCausalReason::TemporalPattern { .. } => {
106 "TemporalPattern".to_string()
107 }
108 datasynth_core::models::AnomalyCausalReason::EntityTargeting { .. } => {
109 "EntityTargeting".to_string()
110 }
111 datasynth_core::models::AnomalyCausalReason::ClusterMembership { .. } => {
112 "ClusterMembership".to_string()
113 }
114 datasynth_core::models::AnomalyCausalReason::ScenarioStep { .. } => {
115 "ScenarioStep".to_string()
116 }
117 datasynth_core::models::AnomalyCausalReason::DataQualityProfile { .. } => {
118 "DataQualityProfile".to_string()
119 }
120 datasynth_core::models::AnomalyCausalReason::MLTrainingBalance { .. } => {
121 "MLTrainingBalance".to_string()
122 }
123 }),
124 causal_reason_json: label
125 .causal_reason
126 .as_ref()
127 .map(|r| serialize_or_warn(r, "causal_reason")),
128 parent_anomaly_id: label.parent_anomaly_id.clone(),
129 child_anomaly_ids: serialize_or_warn(&label.child_anomaly_ids, "child_anomaly_ids"),
130 scenario_id: label.scenario_id.clone(),
131 run_id: label.run_id.clone(),
132 generation_seed: label.generation_seed,
133
134 metadata_json: serialize_or_warn(&label.metadata, "metadata"),
135 }
136 }
137}
138
/// Options accepted by the label export functions.
///
/// NOTE(review): among the exporters visible in this file only
/// `export_labels_json` reads a field (`pretty_json`); `include_provenance`
/// and `include_metadata` are not consulted here — confirm whether other
/// code relies on them.
#[derive(Debug, Clone)]
pub struct LabelExportConfig {
    /// Presumably toggles provenance columns; not read in this file — verify against callers.
    pub include_provenance: bool,
    /// Presumably toggles metadata output; not read in this file — verify against callers.
    pub include_metadata: bool,
    /// When `true`, the JSON exporter emits pretty-printed output.
    pub pretty_json: bool,
}

impl Default for LabelExportConfig {
    /// Returns a configuration with every option switched on.
    fn default() -> Self {
        let (include_provenance, include_metadata, pretty_json) = (true, true, true);
        Self {
            include_provenance,
            include_metadata,
            pretty_json,
        }
    }
}
159
160pub fn export_labels_csv(
162 labels: &[LabeledAnomaly],
163 path: &Path,
164 _config: &LabelExportConfig,
165) -> LabelExportResult<usize> {
166 let file = File::create(path)?;
167 let mut writer = csv::Writer::from_writer(BufWriter::with_capacity(256 * 1024, file));
168
169 for label in labels {
170 let flat: FlatAnomalyLabel = label.into();
171 writer.serialize(&flat)?;
172 }
173
174 writer.flush()?;
175 Ok(labels.len())
176}
177
178pub fn export_labels_json(
180 labels: &[LabeledAnomaly],
181 path: &Path,
182 config: &LabelExportConfig,
183) -> LabelExportResult<usize> {
184 let file = File::create(path)?;
185 let writer = BufWriter::with_capacity(256 * 1024, file);
186
187 if config.pretty_json {
188 serde_json::to_writer_pretty(writer, labels)?;
189 } else {
190 serde_json::to_writer(writer, labels)?;
191 }
192
193 Ok(labels.len())
194}
195
196pub fn export_labels_jsonl(
198 labels: &[LabeledAnomaly],
199 path: &Path,
200 _config: &LabelExportConfig,
201) -> LabelExportResult<usize> {
202 let file = File::create(path)?;
203 let mut writer = BufWriter::with_capacity(256 * 1024, file);
204
205 for label in labels {
206 let json = serde_json::to_string(label)?;
207 writeln!(writer, "{}", json)?;
208 }
209
210 writer.flush()?;
211 Ok(labels.len())
212}
213
214pub fn export_labels_all_formats(
216 labels: &[LabeledAnomaly],
217 output_dir: &Path,
218 base_name: &str,
219 config: &LabelExportConfig,
220) -> LabelExportResult<Vec<(String, usize)>> {
221 std::fs::create_dir_all(output_dir)?;
222
223 let mut results = Vec::new();
224
225 let csv_path = output_dir.join(format!("{}.csv", base_name));
227 let count = export_labels_csv(labels, &csv_path, config)?;
228 results.push((csv_path.display().to_string(), count));
229
230 let json_path = output_dir.join(format!("{}.json", base_name));
232 let count = export_labels_json(labels, &json_path, config)?;
233 results.push((json_path.display().to_string(), count));
234
235 let jsonl_path = output_dir.join(format!("{}.jsonl", base_name));
237 let count = export_labels_jsonl(labels, &jsonl_path, config)?;
238 results.push((jsonl_path.display().to_string(), count));
239
240 Ok(results)
241}
242
243#[derive(Debug, Clone, Serialize)]
245pub struct LabelExportSummary {
246 pub total_labels: usize,
248 pub by_category: std::collections::HashMap<String, usize>,
250 pub by_company: std::collections::HashMap<String, usize>,
252 pub with_provenance: usize,
254 pub in_scenarios: usize,
256 pub in_clusters: usize,
258}
259
260impl LabelExportSummary {
261 pub fn from_labels(labels: &[LabeledAnomaly]) -> Self {
263 let mut by_category = std::collections::HashMap::new();
264 let mut by_company = std::collections::HashMap::new();
265 let mut with_provenance = 0;
266 let mut in_scenarios = 0;
267 let mut in_clusters = 0;
268
269 for label in labels {
270 *by_category
271 .entry(label.anomaly_type.category().to_string())
272 .or_insert(0) += 1;
273 *by_company.entry(label.company_code.clone()).or_insert(0) += 1;
274
275 if label.causal_reason.is_some() || label.structured_strategy.is_some() {
276 with_provenance += 1;
277 }
278 if label.scenario_id.is_some() {
279 in_scenarios += 1;
280 }
281 if label.cluster_id.is_some() {
282 in_clusters += 1;
283 }
284 }
285
286 Self {
287 total_labels: labels.len(),
288 by_category,
289 by_company,
290 with_provenance,
291 in_scenarios,
292 in_clusters,
293 }
294 }
295
296 pub fn write_to_file(&self, path: &Path) -> LabelExportResult<()> {
298 let file = File::create(path)?;
299 let writer = BufWriter::with_capacity(256 * 1024, file);
300 serde_json::to_writer_pretty(writer, self)?;
301 Ok(())
302 }
303}
304
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{AnomalyCausalReason, AnomalyType, FraudType};
    use tempfile::TempDir;

    /// Builds a fraud label with document type "JE" and company code "1000",
    /// dated on the given day of January 2024.
    fn fraud_label(
        anomaly_id: &str,
        fraud: FraudType,
        document_id: &str,
        day: u32,
    ) -> LabeledAnomaly {
        LabeledAnomaly::new(
            anomaly_id.to_string(),
            AnomalyType::Fraud(fraud),
            document_id.to_string(),
            "JE".to_string(),
            "1000".to_string(),
            NaiveDate::from_ymd_opt(2024, 1, day).unwrap(),
        )
    }

    /// Two labels: one with run id, seed and causal reason; one in a cluster.
    fn create_test_labels() -> Vec<LabeledAnomaly> {
        let with_provenance = fraud_label("ANO001", FraudType::SelfApproval, "JE001", 15)
            .with_run_id("run-123")
            .with_generation_seed(42)
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });
        let clustered = fraud_label("ANO002", FraudType::DuplicatePayment, "JE002", 16)
            .with_cluster("cluster-1");

        vec![with_provenance, clustered]
    }

    #[test]
    fn test_export_csv() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.csv");

        let written =
            export_labels_csv(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("SelfApproval"));
    }

    #[test]
    fn test_export_json() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.json");

        let written =
            export_labels_json(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("run-123"));
    }

    #[test]
    fn test_export_jsonl() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.jsonl");

        let written =
            export_labels_jsonl(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        // One JSON document per label, one per line.
        let text = std::fs::read_to_string(&target).unwrap();
        assert_eq!(text.lines().count(), 2);
    }

    #[test]
    fn test_export_all_formats() {
        let dir = TempDir::new().unwrap();

        let results = export_labels_all_formats(
            &create_test_labels(),
            dir.path(),
            "anomaly_labels",
            &LabelExportConfig::default(),
        )
        .unwrap();

        assert_eq!(results.len(), 3);
        for file_name in [
            "anomaly_labels.csv",
            "anomaly_labels.json",
            "anomaly_labels.jsonl",
        ]
        .iter()
        {
            assert!(dir.path().join(file_name).exists());
        }
    }

    #[test]
    fn test_label_export_summary() {
        let summary = LabelExportSummary::from_labels(&create_test_labels());

        assert_eq!(summary.total_labels, 2);
        assert_eq!(summary.by_category.get("Fraud"), Some(&2));
        assert_eq!(summary.with_provenance, 1);
        assert_eq!(summary.in_clusters, 1);
    }

    #[test]
    fn test_flat_label_conversion() {
        let label = fraud_label("ANO001", FraudType::SelfApproval, "JE001", 15)
            .with_run_id("run-123")
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let flat = FlatAnomalyLabel::from(&label);

        assert_eq!(flat.anomaly_id, "ANO001");
        assert_eq!(flat.anomaly_category, "Fraud");
        assert_eq!(flat.run_id, Some("run-123".to_string()));
        assert_eq!(flat.causal_reason_type, Some("RandomRate".to_string()));
    }
}