1use serde::{Deserialize, Serialize};
8
9use datasynth_core::uuid_factory::{DeterministicUuidFactory, GeneratorType};
10
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
13#[serde(rename_all = "snake_case")]
14pub enum LabeledIssueType {
15 MissingValue,
17 Typo,
19 FormatVariation,
21 Duplicate,
23 EncodingIssue,
25 Inconsistency,
27 OutOfRange,
29 InvalidReference,
31}
32
33impl LabeledIssueType {
34 pub fn display_name(&self) -> &'static str {
36 match self {
37 LabeledIssueType::MissingValue => "Missing Value",
38 LabeledIssueType::Typo => "Typo",
39 LabeledIssueType::FormatVariation => "Format Variation",
40 LabeledIssueType::Duplicate => "Duplicate",
41 LabeledIssueType::EncodingIssue => "Encoding Issue",
42 LabeledIssueType::Inconsistency => "Inconsistency",
43 LabeledIssueType::OutOfRange => "Out of Range",
44 LabeledIssueType::InvalidReference => "Invalid Reference",
45 }
46 }
47
48 pub fn default_severity(&self) -> u8 {
50 match self {
51 LabeledIssueType::MissingValue => 3,
52 LabeledIssueType::Typo => 2,
53 LabeledIssueType::FormatVariation => 1,
54 LabeledIssueType::Duplicate => 4,
55 LabeledIssueType::EncodingIssue => 3,
56 LabeledIssueType::Inconsistency => 2,
57 LabeledIssueType::OutOfRange => 4,
58 LabeledIssueType::InvalidReference => 5,
59 }
60 }
61}
62
63#[derive(Debug, Clone, Serialize, Deserialize)]
65#[serde(rename_all = "snake_case")]
66pub enum QualityIssueSubtype {
67 NullValue,
69 EmptyString,
70 Placeholder,
71 SystematicMissing,
72
73 Substitution,
75 Transposition,
76 Insertion,
77 Deletion,
78 OcrError,
79 Homophone,
80
81 DateFormatVariation,
83 AmountFormatVariation,
84 IdentifierFormatVariation,
85 CaseVariation,
86
87 ExactDuplicate,
89 NearDuplicate,
90 FuzzyDuplicate,
91
92 Mojibake,
94 HtmlEntityCorruption,
95 BomIssue,
96 CharacterCorruption,
97
98 Other(String),
100}
101
102#[derive(Debug, Clone, Serialize, Deserialize)]
104pub struct QualityIssueLabel {
105 pub issue_id: String,
107 pub issue_type: LabeledIssueType,
109 pub subtype: Option<QualityIssueSubtype>,
111 pub document_id: String,
113 pub field_name: String,
115 pub original_value: Option<String>,
117 pub modified_value: Option<String>,
119 pub severity: u8,
121 pub processor: String,
123 #[serde(default)]
125 pub metadata: std::collections::HashMap<String, String>,
126}
127
128impl QualityIssueLabel {
129 pub fn new(
131 issue_type: LabeledIssueType,
132 document_id: impl Into<String>,
133 field_name: impl Into<String>,
134 processor: impl Into<String>,
135 ) -> Self {
136 let uuid_factory = DeterministicUuidFactory::new(0, GeneratorType::Anomaly);
137 Self {
138 issue_id: uuid_factory.next().to_string(),
139 issue_type,
140 subtype: None,
141 document_id: document_id.into(),
142 field_name: field_name.into(),
143 original_value: None,
144 modified_value: None,
145 severity: issue_type.default_severity(),
146 processor: processor.into(),
147 metadata: std::collections::HashMap::new(),
148 }
149 }
150
151 pub fn with_subtype(mut self, subtype: QualityIssueSubtype) -> Self {
153 self.subtype = Some(subtype);
154 self
155 }
156
157 pub fn with_original(mut self, value: impl Into<String>) -> Self {
159 self.original_value = Some(value.into());
160 self
161 }
162
163 pub fn with_modified(mut self, value: impl Into<String>) -> Self {
165 self.modified_value = Some(value.into());
166 self
167 }
168
169 pub fn with_values(mut self, original: impl Into<String>, modified: impl Into<String>) -> Self {
171 self.original_value = Some(original.into());
172 self.modified_value = Some(modified.into());
173 self
174 }
175
176 pub fn with_severity(mut self, severity: u8) -> Self {
178 self.severity = severity.clamp(1, 5);
179 self
180 }
181
182 pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
184 self.metadata.insert(key.into(), value.into());
185 self
186 }
187
188 pub fn missing_value(
190 document_id: impl Into<String>,
191 field_name: impl Into<String>,
192 processor: impl Into<String>,
193 ) -> Self {
194 Self::new(
195 LabeledIssueType::MissingValue,
196 document_id,
197 field_name,
198 processor,
199 )
200 }
201
202 pub fn typo(
204 document_id: impl Into<String>,
205 field_name: impl Into<String>,
206 original: impl Into<String>,
207 modified: impl Into<String>,
208 processor: impl Into<String>,
209 ) -> Self {
210 Self::new(LabeledIssueType::Typo, document_id, field_name, processor)
211 .with_values(original, modified)
212 }
213
214 pub fn format_variation(
216 document_id: impl Into<String>,
217 field_name: impl Into<String>,
218 original: impl Into<String>,
219 modified: impl Into<String>,
220 processor: impl Into<String>,
221 ) -> Self {
222 Self::new(
223 LabeledIssueType::FormatVariation,
224 document_id,
225 field_name,
226 processor,
227 )
228 .with_values(original, modified)
229 }
230
231 pub fn duplicate(
233 document_id: impl Into<String>,
234 original_doc_id: impl Into<String>,
235 processor: impl Into<String>,
236 ) -> Self {
237 Self::new(
238 LabeledIssueType::Duplicate,
239 document_id,
240 "_record",
241 processor,
242 )
243 .with_metadata("original_document_id", original_doc_id)
244 }
245}
246
247#[derive(Debug, Clone, Default, Serialize, Deserialize)]
249pub struct QualityLabels {
250 pub labels: Vec<QualityIssueLabel>,
252}
253
254impl QualityLabels {
255 pub fn new() -> Self {
257 Self { labels: Vec::new() }
258 }
259
260 pub fn with_capacity(capacity: usize) -> Self {
262 Self {
263 labels: Vec::with_capacity(capacity),
264 }
265 }
266
267 pub fn add(&mut self, label: QualityIssueLabel) {
269 self.labels.push(label);
270 }
271
272 pub fn extend(&mut self, labels: impl IntoIterator<Item = QualityIssueLabel>) {
274 self.labels.extend(labels);
275 }
276
277 pub fn len(&self) -> usize {
279 self.labels.len()
280 }
281
282 pub fn is_empty(&self) -> bool {
284 self.labels.is_empty()
285 }
286
287 pub fn count_by_type(&self) -> std::collections::HashMap<LabeledIssueType, usize> {
289 let mut counts = std::collections::HashMap::new();
290 for label in &self.labels {
291 *counts.entry(label.issue_type).or_insert(0) += 1;
292 }
293 counts
294 }
295
296 pub fn count_by_processor(&self) -> std::collections::HashMap<String, usize> {
298 let mut counts = std::collections::HashMap::new();
299 for label in &self.labels {
300 *counts.entry(label.processor.clone()).or_insert(0) += 1;
301 }
302 counts
303 }
304
305 pub fn for_document(&self, document_id: &str) -> Vec<&QualityIssueLabel> {
307 self.labels
308 .iter()
309 .filter(|l| l.document_id == document_id)
310 .collect()
311 }
312
313 pub fn for_field(&self, field_name: &str) -> Vec<&QualityIssueLabel> {
315 self.labels
316 .iter()
317 .filter(|l| l.field_name == field_name)
318 .collect()
319 }
320
321 pub fn of_type(&self, issue_type: LabeledIssueType) -> Vec<&QualityIssueLabel> {
323 self.labels
324 .iter()
325 .filter(|l| l.issue_type == issue_type)
326 .collect()
327 }
328
329 pub fn summary(&self) -> QualityLabelSummary {
331 let counts = self.count_by_type();
332 QualityLabelSummary {
333 total_labels: self.labels.len(),
334 missing_values: *counts.get(&LabeledIssueType::MissingValue).unwrap_or(&0),
335 typos: *counts.get(&LabeledIssueType::Typo).unwrap_or(&0),
336 format_variations: *counts.get(&LabeledIssueType::FormatVariation).unwrap_or(&0),
337 duplicates: *counts.get(&LabeledIssueType::Duplicate).unwrap_or(&0),
338 encoding_issues: *counts.get(&LabeledIssueType::EncodingIssue).unwrap_or(&0),
339 unique_documents: self
340 .labels
341 .iter()
342 .map(|l| &l.document_id)
343 .collect::<std::collections::HashSet<_>>()
344 .len(),
345 unique_fields: self
346 .labels
347 .iter()
348 .map(|l| &l.field_name)
349 .collect::<std::collections::HashSet<_>>()
350 .len(),
351 }
352 }
353
354 pub fn to_csv_rows(&self) -> Vec<Vec<String>> {
356 self.labels
357 .iter()
358 .map(|l| {
359 vec![
360 l.issue_id.clone(),
361 format!("{:?}", l.issue_type),
362 l.subtype
363 .as_ref()
364 .map(|s| format!("{:?}", s))
365 .unwrap_or_default(),
366 l.document_id.clone(),
367 l.field_name.clone(),
368 l.original_value.clone().unwrap_or_default(),
369 l.modified_value.clone().unwrap_or_default(),
370 l.severity.to_string(),
371 l.processor.clone(),
372 ]
373 })
374 .collect()
375 }
376
377 pub fn csv_header() -> Vec<&'static str> {
379 vec![
380 "issue_id",
381 "issue_type",
382 "subtype",
383 "document_id",
384 "field_name",
385 "original_value",
386 "modified_value",
387 "severity",
388 "processor",
389 ]
390 }
391}
392
393#[derive(Debug, Clone, Default, Serialize, Deserialize)]
395pub struct QualityLabelSummary {
396 pub total_labels: usize,
398 pub missing_values: usize,
400 pub typos: usize,
402 pub format_variations: usize,
404 pub duplicates: usize,
406 pub encoding_issues: usize,
408 pub unique_documents: usize,
410 pub unique_fields: usize,
412}
413
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// The constructor and the value/subtype builders populate the label.
    #[test]
    fn test_label_creation() {
        let created = QualityIssueLabel::new(
            LabeledIssueType::Typo,
            "doc-123",
            "vendor_name",
            "typo_processor",
        );
        let label = created
            .with_values("Acme Corp", "Acne Corp")
            .with_subtype(QualityIssueSubtype::Substitution);

        assert_eq!(label.issue_type, LabeledIssueType::Typo);
        assert_eq!(label.document_id, "doc-123");
        assert_eq!(label.field_name, "vendor_name");
        assert_eq!(label.original_value.as_deref(), Some("Acme Corp"));
        assert_eq!(label.modified_value.as_deref(), Some("Acne Corp"));
    }

    /// Each convenience constructor produces a label of the matching type.
    #[test]
    fn test_label_helpers() {
        let missing = QualityIssueLabel::missing_value("doc-1", "amount", "missing_processor");
        assert_eq!(missing.issue_type, LabeledIssueType::MissingValue);

        let typo = QualityIssueLabel::typo("doc-2", "name", "John", "Jphn", "typo_processor");
        assert_eq!(typo.issue_type, LabeledIssueType::Typo);
        assert_eq!(typo.original_value.as_deref(), Some("John"));

        let duplicate = QualityIssueLabel::duplicate("doc-3", "doc-1", "dup_processor");
        assert_eq!(duplicate.issue_type, LabeledIssueType::Duplicate);
    }

    /// Length, per-type counts and per-document lookup on a small collection.
    #[test]
    fn test_quality_labels_collection() {
        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::missing_value("doc-1", "field1", "proc1"));
        labels.add(QualityIssueLabel::typo("doc-1", "field2", "a", "b", "proc2"));
        labels.add(QualityIssueLabel::typo("doc-2", "field1", "x", "y", "proc2"));

        assert_eq!(labels.len(), 3);

        let per_type = labels.count_by_type();
        assert_eq!(
            per_type.get(&LabeledIssueType::MissingValue).copied(),
            Some(1)
        );
        assert_eq!(per_type.get(&LabeledIssueType::Typo).copied(), Some(2));

        assert_eq!(labels.for_document("doc-1").len(), 2);
    }

    /// The summary reports totals, per-type counts and distinct documents
    /// and fields.
    #[test]
    fn test_summary() {
        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::missing_value("doc-1", "field1", "proc1"));
        labels.add(QualityIssueLabel::typo("doc-1", "field2", "a", "b", "proc2"));
        labels.add(QualityIssueLabel::format_variation(
            "doc-2",
            "date",
            "2024-01-01",
            "01/01/2024",
            "proc3",
        ));

        let QualityLabelSummary {
            total_labels,
            missing_values,
            typos,
            format_variations,
            unique_documents,
            unique_fields,
            ..
        } = labels.summary();

        assert_eq!(total_labels, 3);
        assert_eq!(missing_values, 1);
        assert_eq!(typos, 1);
        assert_eq!(format_variations, 1);
        assert_eq!(unique_documents, 2);
        assert_eq!(unique_fields, 3);
    }

    /// The CSV header and every exported row have nine columns.
    #[test]
    fn test_csv_export() {
        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::typo(
            "doc-1",
            "name",
            "Test",
            "Tset",
            "typo_proc",
        ));

        assert_eq!(QualityLabels::csv_header().len(), 9);

        let rows = labels.to_csv_rows();
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 9);
    }
}