1use datasynth_core::models::LabeledAnomaly;
7use serde::Serialize;
8use std::fs::File;
9use std::io::{BufWriter, Write};
10use std::path::Path;
11
12#[derive(Debug, thiserror::Error)]
14pub enum LabelExportError {
15 #[error("IO error: {0}")]
16 Io(#[from] std::io::Error),
17 #[error("CSV error: {0}")]
18 Csv(#[from] csv::Error),
19 #[error("JSON error: {0}")]
20 Json(#[from] serde_json::Error),
21}
22
23pub type LabelExportResult<T> = Result<T, LabelExportError>;
25
26#[derive(Debug, Serialize)]
29pub struct FlatAnomalyLabel {
30 pub anomaly_id: String,
32 pub anomaly_category: String,
33 pub anomaly_type: String,
34 pub document_id: String,
35 pub document_type: String,
36 pub company_code: String,
37 pub anomaly_date: String,
38 pub detection_timestamp: String,
39 pub confidence: f64,
40 pub severity: u8,
41 pub description: String,
42 pub is_injected: bool,
43 pub monetary_impact: Option<String>,
44 pub related_entities: String, pub cluster_id: Option<String>,
46
47 pub original_document_hash: Option<String>,
49 pub injection_strategy: Option<String>,
50 pub structured_strategy_type: Option<String>,
51 pub structured_strategy_json: Option<String>,
52 pub causal_reason_type: Option<String>,
53 pub causal_reason_json: Option<String>,
54 pub parent_anomaly_id: Option<String>,
55 pub child_anomaly_ids: String, pub scenario_id: Option<String>,
57 pub run_id: Option<String>,
58 pub generation_seed: Option<u64>,
59
60 pub metadata_json: String,
62}
63
64impl From<&LabeledAnomaly> for FlatAnomalyLabel {
65 fn from(label: &LabeledAnomaly) -> Self {
66 Self {
67 anomaly_id: label.anomaly_id.clone(),
68 anomaly_category: label.anomaly_type.category().to_string(),
69 anomaly_type: label.anomaly_type.type_name(),
70 document_id: label.document_id.clone(),
71 document_type: label.document_type.clone(),
72 company_code: label.company_code.clone(),
73 anomaly_date: label.anomaly_date.to_string(),
74 detection_timestamp: label.detection_timestamp.to_string(),
75 confidence: label.confidence,
76 severity: label.severity,
77 description: label.description.clone(),
78 is_injected: label.is_injected,
79 monetary_impact: label.monetary_impact.map(|d| d.to_string()),
80 related_entities: serde_json::to_string(&label.related_entities).unwrap_or_default(),
81 cluster_id: label.cluster_id.clone(),
82
83 original_document_hash: label.original_document_hash.clone(),
85 injection_strategy: label.injection_strategy.clone(),
86 structured_strategy_type: label
87 .structured_strategy
88 .as_ref()
89 .map(|s| s.strategy_type().to_string()),
90 structured_strategy_json: label
91 .structured_strategy
92 .as_ref()
93 .map(|s| serde_json::to_string(s).unwrap_or_default()),
94 causal_reason_type: label.causal_reason.as_ref().map(|r| match r {
95 datasynth_core::models::AnomalyCausalReason::RandomRate { .. } => {
96 "RandomRate".to_string()
97 }
98 datasynth_core::models::AnomalyCausalReason::TemporalPattern { .. } => {
99 "TemporalPattern".to_string()
100 }
101 datasynth_core::models::AnomalyCausalReason::EntityTargeting { .. } => {
102 "EntityTargeting".to_string()
103 }
104 datasynth_core::models::AnomalyCausalReason::ClusterMembership { .. } => {
105 "ClusterMembership".to_string()
106 }
107 datasynth_core::models::AnomalyCausalReason::ScenarioStep { .. } => {
108 "ScenarioStep".to_string()
109 }
110 datasynth_core::models::AnomalyCausalReason::DataQualityProfile { .. } => {
111 "DataQualityProfile".to_string()
112 }
113 datasynth_core::models::AnomalyCausalReason::MLTrainingBalance { .. } => {
114 "MLTrainingBalance".to_string()
115 }
116 }),
117 causal_reason_json: label
118 .causal_reason
119 .as_ref()
120 .map(|r| serde_json::to_string(r).unwrap_or_default()),
121 parent_anomaly_id: label.parent_anomaly_id.clone(),
122 child_anomaly_ids: serde_json::to_string(&label.child_anomaly_ids).unwrap_or_default(),
123 scenario_id: label.scenario_id.clone(),
124 run_id: label.run_id.clone(),
125 generation_seed: label.generation_seed,
126
127 metadata_json: serde_json::to_string(&label.metadata).unwrap_or_default(),
128 }
129 }
130}
131
/// Options passed to the label export functions.
///
/// Within this file only `pretty_json` is read (by `export_labels_json`);
/// the CSV and JSONL exporters accept the config but do not consult it.
#[derive(Debug, Clone)]
pub struct LabelExportConfig {
    /// Whether provenance information should be included.
    /// NOTE(review): not read by any exporter visible in this file — confirm intended use.
    pub include_provenance: bool,
    /// Whether metadata should be included.
    /// NOTE(review): not read by any exporter visible in this file — confirm intended use.
    pub include_metadata: bool,
    /// When `true`, `export_labels_json` writes pretty-printed JSON instead of compact JSON.
    pub pretty_json: bool,
}
142
143impl Default for LabelExportConfig {
144 fn default() -> Self {
145 Self {
146 include_provenance: true,
147 include_metadata: true,
148 pretty_json: true,
149 }
150 }
151}
152
153pub fn export_labels_csv(
155 labels: &[LabeledAnomaly],
156 path: &Path,
157 _config: &LabelExportConfig,
158) -> LabelExportResult<usize> {
159 let file = File::create(path)?;
160 let mut writer = csv::Writer::from_writer(BufWriter::new(file));
161
162 for label in labels {
163 let flat: FlatAnomalyLabel = label.into();
164 writer.serialize(&flat)?;
165 }
166
167 writer.flush()?;
168 Ok(labels.len())
169}
170
171pub fn export_labels_json(
173 labels: &[LabeledAnomaly],
174 path: &Path,
175 config: &LabelExportConfig,
176) -> LabelExportResult<usize> {
177 let file = File::create(path)?;
178 let writer = BufWriter::new(file);
179
180 if config.pretty_json {
181 serde_json::to_writer_pretty(writer, labels)?;
182 } else {
183 serde_json::to_writer(writer, labels)?;
184 }
185
186 Ok(labels.len())
187}
188
189pub fn export_labels_jsonl(
191 labels: &[LabeledAnomaly],
192 path: &Path,
193 _config: &LabelExportConfig,
194) -> LabelExportResult<usize> {
195 let file = File::create(path)?;
196 let mut writer = BufWriter::new(file);
197
198 for label in labels {
199 let json = serde_json::to_string(label)?;
200 writeln!(writer, "{}", json)?;
201 }
202
203 writer.flush()?;
204 Ok(labels.len())
205}
206
207pub fn export_labels_all_formats(
209 labels: &[LabeledAnomaly],
210 output_dir: &Path,
211 base_name: &str,
212 config: &LabelExportConfig,
213) -> LabelExportResult<Vec<(String, usize)>> {
214 std::fs::create_dir_all(output_dir)?;
215
216 let mut results = Vec::new();
217
218 let csv_path = output_dir.join(format!("{}.csv", base_name));
220 let count = export_labels_csv(labels, &csv_path, config)?;
221 results.push((csv_path.display().to_string(), count));
222
223 let json_path = output_dir.join(format!("{}.json", base_name));
225 let count = export_labels_json(labels, &json_path, config)?;
226 results.push((json_path.display().to_string(), count));
227
228 let jsonl_path = output_dir.join(format!("{}.jsonl", base_name));
230 let count = export_labels_jsonl(labels, &jsonl_path, config)?;
231 results.push((jsonl_path.display().to_string(), count));
232
233 Ok(results)
234}
235
236#[derive(Debug, Clone, Serialize)]
238pub struct LabelExportSummary {
239 pub total_labels: usize,
241 pub by_category: std::collections::HashMap<String, usize>,
243 pub by_company: std::collections::HashMap<String, usize>,
245 pub with_provenance: usize,
247 pub in_scenarios: usize,
249 pub in_clusters: usize,
251}
252
253impl LabelExportSummary {
254 pub fn from_labels(labels: &[LabeledAnomaly]) -> Self {
256 let mut by_category = std::collections::HashMap::new();
257 let mut by_company = std::collections::HashMap::new();
258 let mut with_provenance = 0;
259 let mut in_scenarios = 0;
260 let mut in_clusters = 0;
261
262 for label in labels {
263 *by_category
264 .entry(label.anomaly_type.category().to_string())
265 .or_insert(0) += 1;
266 *by_company.entry(label.company_code.clone()).or_insert(0) += 1;
267
268 if label.causal_reason.is_some() || label.structured_strategy.is_some() {
269 with_provenance += 1;
270 }
271 if label.scenario_id.is_some() {
272 in_scenarios += 1;
273 }
274 if label.cluster_id.is_some() {
275 in_clusters += 1;
276 }
277 }
278
279 Self {
280 total_labels: labels.len(),
281 by_category,
282 by_company,
283 with_provenance,
284 in_scenarios,
285 in_clusters,
286 }
287 }
288
289 pub fn write_to_file(&self, path: &Path) -> LabelExportResult<()> {
291 let file = File::create(path)?;
292 let writer = BufWriter::new(file);
293 serde_json::to_writer_pretty(writer, self)?;
294 Ok(())
295 }
296}
297
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{AnomalyCausalReason, AnomalyType, FraudType};
    use tempfile::TempDir;

    /// Builds a fraud label on a "JE" document of company "1000", dated on
    /// the given day of January 2024.
    fn january_label(id: &str, fraud: FraudType, document_id: &str, day: u32) -> LabeledAnomaly {
        LabeledAnomaly::new(
            id.to_string(),
            AnomalyType::Fraud(fraud),
            document_id.to_string(),
            "JE".to_string(),
            "1000".to_string(),
            NaiveDate::from_ymd_opt(2024, 1, day).unwrap(),
        )
    }

    /// Two labels: one with run id, seed and causal reason, one in a cluster.
    fn create_test_labels() -> Vec<LabeledAnomaly> {
        let with_provenance = january_label("ANO001", FraudType::SelfApproval, "JE001", 15)
            .with_run_id("run-123")
            .with_generation_seed(42)
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });
        let clustered = january_label("ANO002", FraudType::DuplicatePayment, "JE002", 16)
            .with_cluster("cluster-1");

        vec![with_provenance, clustered]
    }

    #[test]
    fn test_export_csv() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.csv");

        let written =
            export_labels_csv(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("SelfApproval"));
    }

    #[test]
    fn test_export_json() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.json");

        let written =
            export_labels_json(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        let text = std::fs::read_to_string(&target).unwrap();
        assert!(text.contains("ANO001"));
        assert!(text.contains("run-123"));
    }

    #[test]
    fn test_export_jsonl() {
        let dir = TempDir::new().unwrap();
        let target = dir.path().join("labels.jsonl");

        let written =
            export_labels_jsonl(&create_test_labels(), &target, &LabelExportConfig::default())
                .unwrap();

        assert_eq!(written, 2);
        assert!(target.exists());

        // One JSON document per label, one per line.
        let text = std::fs::read_to_string(&target).unwrap();
        assert_eq!(text.lines().count(), 2);
    }

    #[test]
    fn test_export_all_formats() {
        let dir = TempDir::new().unwrap();
        let out_dir = dir.path();

        let outcome = export_labels_all_formats(
            &create_test_labels(),
            out_dir,
            "anomaly_labels",
            &LabelExportConfig::default(),
        )
        .unwrap();

        assert_eq!(outcome.len(), 3);
        assert!(out_dir.join("anomaly_labels.csv").exists());
        assert!(out_dir.join("anomaly_labels.json").exists());
        assert!(out_dir.join("anomaly_labels.jsonl").exists());
    }

    #[test]
    fn test_label_export_summary() {
        let summary = LabelExportSummary::from_labels(&create_test_labels());

        assert_eq!(summary.total_labels, 2);
        assert_eq!(summary.by_category.get("Fraud"), Some(&2));
        assert_eq!(summary.with_provenance, 1);
        assert_eq!(summary.in_clusters, 1);
    }

    #[test]
    fn test_flat_label_conversion() {
        let label = january_label("ANO001", FraudType::SelfApproval, "JE001", 15)
            .with_run_id("run-123")
            .with_causal_reason(AnomalyCausalReason::RandomRate { base_rate: 0.02 });

        let flat = FlatAnomalyLabel::from(&label);

        assert_eq!(flat.anomaly_id, "ANO001");
        assert_eq!(flat.anomaly_category, "Fraud");
        assert_eq!(flat.run_id, Some("run-123".to_string()));
        assert_eq!(flat.causal_reason_type, Some("RandomRate".to_string()));
    }
}