Skip to main content

datasynth_generators/data_quality/
labels.rs

1//! Quality issue labels for ML training and data quality tracking.
2//!
3//! This module provides labeling structures for tracking data quality issues
4//! injected into synthetic data. Labels can be exported for use in training
5//! ML models for data quality detection.
6
7use serde::{Deserialize, Serialize};
8
9use datasynth_core::uuid_factory::{DeterministicUuidFactory, GeneratorType};
10
11/// Type of data quality issue.
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
13#[serde(rename_all = "snake_case")]
14pub enum LabeledIssueType {
15    /// Missing value (null, empty, or placeholder)
16    MissingValue,
17    /// Typo or character error
18    Typo,
19    /// Format variation from standard
20    FormatVariation,
21    /// Duplicate record (exact or near)
22    Duplicate,
23    /// Encoding issue (mojibake, corruption)
24    EncodingIssue,
25    /// Inconsistent data (e.g., mismatched formats)
26    Inconsistency,
27    /// Out of range value
28    OutOfRange,
29    /// Invalid reference (foreign key violation)
30    InvalidReference,
31}
32
33impl LabeledIssueType {
34    /// Get the display name for this issue type.
35    pub fn display_name(&self) -> &'static str {
36        match self {
37            LabeledIssueType::MissingValue => "Missing Value",
38            LabeledIssueType::Typo => "Typo",
39            LabeledIssueType::FormatVariation => "Format Variation",
40            LabeledIssueType::Duplicate => "Duplicate",
41            LabeledIssueType::EncodingIssue => "Encoding Issue",
42            LabeledIssueType::Inconsistency => "Inconsistency",
43            LabeledIssueType::OutOfRange => "Out of Range",
44            LabeledIssueType::InvalidReference => "Invalid Reference",
45        }
46    }
47
48    /// Get the severity level (1-5, with 5 being most severe).
49    pub fn default_severity(&self) -> u8 {
50        match self {
51            LabeledIssueType::MissingValue => 3,
52            LabeledIssueType::Typo => 2,
53            LabeledIssueType::FormatVariation => 1,
54            LabeledIssueType::Duplicate => 4,
55            LabeledIssueType::EncodingIssue => 3,
56            LabeledIssueType::Inconsistency => 2,
57            LabeledIssueType::OutOfRange => 4,
58            LabeledIssueType::InvalidReference => 5,
59        }
60    }
61}
62
63/// Subtype providing more detail about the issue.
64#[derive(Debug, Clone, Serialize, Deserialize)]
65#[serde(rename_all = "snake_case")]
66pub enum QualityIssueSubtype {
67    // Missing value subtypes
68    NullValue,
69    EmptyString,
70    Placeholder,
71    SystematicMissing,
72
73    // Typo subtypes
74    Substitution,
75    Transposition,
76    Insertion,
77    Deletion,
78    OcrError,
79    Homophone,
80
81    // Format variation subtypes
82    DateFormatVariation,
83    AmountFormatVariation,
84    IdentifierFormatVariation,
85    CaseVariation,
86
87    // Duplicate subtypes
88    ExactDuplicate,
89    NearDuplicate,
90    FuzzyDuplicate,
91
92    // Encoding subtypes
93    Mojibake,
94    HtmlEntityCorruption,
95    BomIssue,
96    CharacterCorruption,
97
98    // Generic
99    Other(String),
100}
101
102/// A label describing a data quality issue for ML training.
103#[derive(Debug, Clone, Serialize, Deserialize)]
104pub struct QualityIssueLabel {
105    /// Unique identifier for this issue
106    pub issue_id: String,
107    /// Type of quality issue
108    pub issue_type: LabeledIssueType,
109    /// More specific subtype
110    pub subtype: Option<QualityIssueSubtype>,
111    /// ID of the affected document/record
112    pub document_id: String,
113    /// Name of the affected field
114    pub field_name: String,
115    /// Original value before modification (if available)
116    pub original_value: Option<String>,
117    /// Modified/corrupted value (if applicable)
118    pub modified_value: Option<String>,
119    /// Severity level (1-5)
120    pub severity: u8,
121    /// Name of the processor that created this issue
122    pub processor: String,
123    /// Additional metadata
124    #[serde(default)]
125    pub metadata: std::collections::HashMap<String, String>,
126}
127
128impl QualityIssueLabel {
129    /// Create a new quality issue label.
130    pub fn new(
131        issue_type: LabeledIssueType,
132        document_id: impl Into<String>,
133        field_name: impl Into<String>,
134        processor: impl Into<String>,
135    ) -> Self {
136        let uuid_factory = DeterministicUuidFactory::new(0, GeneratorType::Anomaly);
137        Self {
138            issue_id: uuid_factory.next().to_string(),
139            issue_type,
140            subtype: None,
141            document_id: document_id.into(),
142            field_name: field_name.into(),
143            original_value: None,
144            modified_value: None,
145            severity: issue_type.default_severity(),
146            processor: processor.into(),
147            metadata: std::collections::HashMap::new(),
148        }
149    }
150
151    /// Set the subtype.
152    pub fn with_subtype(mut self, subtype: QualityIssueSubtype) -> Self {
153        self.subtype = Some(subtype);
154        self
155    }
156
157    /// Set the original value.
158    pub fn with_original(mut self, value: impl Into<String>) -> Self {
159        self.original_value = Some(value.into());
160        self
161    }
162
163    /// Set the modified value.
164    pub fn with_modified(mut self, value: impl Into<String>) -> Self {
165        self.modified_value = Some(value.into());
166        self
167    }
168
169    /// Set both original and modified values.
170    pub fn with_values(mut self, original: impl Into<String>, modified: impl Into<String>) -> Self {
171        self.original_value = Some(original.into());
172        self.modified_value = Some(modified.into());
173        self
174    }
175
176    /// Set the severity level.
177    pub fn with_severity(mut self, severity: u8) -> Self {
178        self.severity = severity.clamp(1, 5);
179        self
180    }
181
182    /// Add metadata.
183    pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
184        self.metadata.insert(key.into(), value.into());
185        self
186    }
187
188    /// Create a missing value label.
189    pub fn missing_value(
190        document_id: impl Into<String>,
191        field_name: impl Into<String>,
192        processor: impl Into<String>,
193    ) -> Self {
194        Self::new(
195            LabeledIssueType::MissingValue,
196            document_id,
197            field_name,
198            processor,
199        )
200    }
201
202    /// Create a typo label.
203    pub fn typo(
204        document_id: impl Into<String>,
205        field_name: impl Into<String>,
206        original: impl Into<String>,
207        modified: impl Into<String>,
208        processor: impl Into<String>,
209    ) -> Self {
210        Self::new(LabeledIssueType::Typo, document_id, field_name, processor)
211            .with_values(original, modified)
212    }
213
214    /// Create a format variation label.
215    pub fn format_variation(
216        document_id: impl Into<String>,
217        field_name: impl Into<String>,
218        original: impl Into<String>,
219        modified: impl Into<String>,
220        processor: impl Into<String>,
221    ) -> Self {
222        Self::new(
223            LabeledIssueType::FormatVariation,
224            document_id,
225            field_name,
226            processor,
227        )
228        .with_values(original, modified)
229    }
230
231    /// Create a duplicate label.
232    pub fn duplicate(
233        document_id: impl Into<String>,
234        original_doc_id: impl Into<String>,
235        processor: impl Into<String>,
236    ) -> Self {
237        Self::new(
238            LabeledIssueType::Duplicate,
239            document_id,
240            "_record",
241            processor,
242        )
243        .with_metadata("original_document_id", original_doc_id)
244    }
245}
246
247/// Collection of quality issue labels with aggregation methods.
248#[derive(Debug, Clone, Default, Serialize, Deserialize)]
249pub struct QualityLabels {
250    /// All labels in this collection
251    pub labels: Vec<QualityIssueLabel>,
252}
253
254impl QualityLabels {
255    /// Create a new empty label collection.
256    pub fn new() -> Self {
257        Self { labels: Vec::new() }
258    }
259
260    /// Create with pre-allocated capacity.
261    pub fn with_capacity(capacity: usize) -> Self {
262        Self {
263            labels: Vec::with_capacity(capacity),
264        }
265    }
266
267    /// Add a label.
268    pub fn add(&mut self, label: QualityIssueLabel) {
269        self.labels.push(label);
270    }
271
272    /// Extend with more labels.
273    pub fn extend(&mut self, labels: impl IntoIterator<Item = QualityIssueLabel>) {
274        self.labels.extend(labels);
275    }
276
277    /// Get total number of labels.
278    pub fn len(&self) -> usize {
279        self.labels.len()
280    }
281
282    /// Check if empty.
283    pub fn is_empty(&self) -> bool {
284        self.labels.is_empty()
285    }
286
287    /// Count labels by type.
288    pub fn count_by_type(&self) -> std::collections::HashMap<LabeledIssueType, usize> {
289        let mut counts = std::collections::HashMap::new();
290        for label in &self.labels {
291            *counts.entry(label.issue_type).or_insert(0) += 1;
292        }
293        counts
294    }
295
296    /// Count labels by processor.
297    pub fn count_by_processor(&self) -> std::collections::HashMap<String, usize> {
298        let mut counts = std::collections::HashMap::new();
299        for label in &self.labels {
300            *counts.entry(label.processor.clone()).or_insert(0) += 1;
301        }
302        counts
303    }
304
305    /// Get labels for a specific document.
306    pub fn for_document(&self, document_id: &str) -> Vec<&QualityIssueLabel> {
307        self.labels
308            .iter()
309            .filter(|l| l.document_id == document_id)
310            .collect()
311    }
312
313    /// Get labels for a specific field.
314    pub fn for_field(&self, field_name: &str) -> Vec<&QualityIssueLabel> {
315        self.labels
316            .iter()
317            .filter(|l| l.field_name == field_name)
318            .collect()
319    }
320
321    /// Get labels of a specific type.
322    pub fn of_type(&self, issue_type: LabeledIssueType) -> Vec<&QualityIssueLabel> {
323        self.labels
324            .iter()
325            .filter(|l| l.issue_type == issue_type)
326            .collect()
327    }
328
329    /// Get summary statistics.
330    pub fn summary(&self) -> QualityLabelSummary {
331        let counts = self.count_by_type();
332        QualityLabelSummary {
333            total_labels: self.labels.len(),
334            missing_values: *counts.get(&LabeledIssueType::MissingValue).unwrap_or(&0),
335            typos: *counts.get(&LabeledIssueType::Typo).unwrap_or(&0),
336            format_variations: *counts.get(&LabeledIssueType::FormatVariation).unwrap_or(&0),
337            duplicates: *counts.get(&LabeledIssueType::Duplicate).unwrap_or(&0),
338            encoding_issues: *counts.get(&LabeledIssueType::EncodingIssue).unwrap_or(&0),
339            unique_documents: self
340                .labels
341                .iter()
342                .map(|l| &l.document_id)
343                .collect::<std::collections::HashSet<_>>()
344                .len(),
345            unique_fields: self
346                .labels
347                .iter()
348                .map(|l| &l.field_name)
349                .collect::<std::collections::HashSet<_>>()
350                .len(),
351        }
352    }
353
354    /// Convert to CSV rows.
355    pub fn to_csv_rows(&self) -> Vec<Vec<String>> {
356        self.labels
357            .iter()
358            .map(|l| {
359                vec![
360                    l.issue_id.clone(),
361                    format!("{:?}", l.issue_type),
362                    l.subtype
363                        .as_ref()
364                        .map(|s| format!("{:?}", s))
365                        .unwrap_or_default(),
366                    l.document_id.clone(),
367                    l.field_name.clone(),
368                    l.original_value.clone().unwrap_or_default(),
369                    l.modified_value.clone().unwrap_or_default(),
370                    l.severity.to_string(),
371                    l.processor.clone(),
372                ]
373            })
374            .collect()
375    }
376
377    /// Get CSV header.
378    pub fn csv_header() -> Vec<&'static str> {
379        vec![
380            "issue_id",
381            "issue_type",
382            "subtype",
383            "document_id",
384            "field_name",
385            "original_value",
386            "modified_value",
387            "severity",
388            "processor",
389        ]
390    }
391}
392
393/// Summary statistics for quality labels.
394#[derive(Debug, Clone, Default, Serialize, Deserialize)]
395pub struct QualityLabelSummary {
396    /// Total number of labels
397    pub total_labels: usize,
398    /// Number of missing value issues
399    pub missing_values: usize,
400    /// Number of typo issues
401    pub typos: usize,
402    /// Number of format variation issues
403    pub format_variations: usize,
404    /// Number of duplicate issues
405    pub duplicates: usize,
406    /// Number of encoding issues
407    pub encoding_issues: usize,
408    /// Number of unique documents affected
409    pub unique_documents: usize,
410    /// Number of unique fields affected
411    pub unique_fields: usize,
412}
413
#[cfg(test)]
// Unwrapping is acceptable in tests: a failed unwrap simply fails the test.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// `new` plus the builder methods populate every field as requested.
    #[test]
    fn test_label_creation() {
        let label = QualityIssueLabel::new(
            LabeledIssueType::Typo,
            "doc-123",
            "vendor_name",
            "typo_processor",
        )
        .with_values("Acme Corp", "Acne Corp")
        .with_subtype(QualityIssueSubtype::Substitution);

        assert_eq!(label.issue_type, LabeledIssueType::Typo);
        assert_eq!(label.document_id, "doc-123");
        assert_eq!(label.field_name, "vendor_name");
        assert_eq!(label.processor, "typo_processor");
        assert_eq!(label.original_value, Some("Acme Corp".to_string()));
        assert_eq!(label.modified_value, Some("Acne Corp".to_string()));
        assert!(matches!(
            label.subtype,
            Some(QualityIssueSubtype::Substitution)
        ));
        // Severity falls back to the default of the issue type.
        assert_eq!(label.severity, LabeledIssueType::Typo.default_severity());
        assert!(label.metadata.is_empty());
    }

    /// The convenience constructors set the type and their extra fields.
    #[test]
    fn test_label_helpers() {
        let missing = QualityIssueLabel::missing_value("doc-1", "amount", "missing_processor");
        assert_eq!(missing.issue_type, LabeledIssueType::MissingValue);
        assert_eq!(missing.field_name, "amount");
        assert_eq!(missing.original_value, None);
        assert_eq!(missing.modified_value, None);
        assert_eq!(missing.severity, 3);

        let typo = QualityIssueLabel::typo("doc-2", "name", "John", "Jphn", "typo_processor");
        assert_eq!(typo.issue_type, LabeledIssueType::Typo);
        assert_eq!(typo.original_value, Some("John".to_string()));
        assert_eq!(typo.modified_value, Some("Jphn".to_string()));

        let format = QualityIssueLabel::format_variation(
            "doc-4",
            "date",
            "2024-01-01",
            "01/01/2024",
            "format_processor",
        );
        assert_eq!(format.issue_type, LabeledIssueType::FormatVariation);
        assert_eq!(format.original_value.as_deref(), Some("2024-01-01"));
        assert_eq!(format.modified_value.as_deref(), Some("01/01/2024"));

        let duplicate = QualityIssueLabel::duplicate("doc-3", "doc-1", "dup_processor");
        assert_eq!(duplicate.issue_type, LabeledIssueType::Duplicate);
        assert_eq!(duplicate.document_id, "doc-3");
        assert_eq!(duplicate.field_name, "_record");
        assert_eq!(
            duplicate
                .metadata
                .get("original_document_id")
                .map(String::as_str),
            Some("doc-1")
        );
    }

    /// `with_severity` keeps in-range values and clamps the rest to 1-5.
    #[test]
    fn test_severity_is_clamped() {
        let base = QualityIssueLabel::missing_value("doc-1", "amount", "proc");
        assert_eq!(base.clone().with_severity(0).severity, 1);
        assert_eq!(base.clone().with_severity(3).severity, 3);
        assert_eq!(base.with_severity(200).severity, 5);
    }

    /// Display names and default severities are defined for every type.
    #[test]
    fn test_issue_type_properties() {
        assert_eq!(LabeledIssueType::OutOfRange.display_name(), "Out of Range");
        assert_eq!(LabeledIssueType::FormatVariation.default_severity(), 1);
        assert_eq!(LabeledIssueType::InvalidReference.default_severity(), 5);
    }

    /// Counting and filtering over a small collection.
    #[test]
    fn test_quality_labels_collection() {
        assert!(QualityLabels::new().is_empty());

        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::missing_value("doc-1", "field1", "proc1"));
        labels.add(QualityIssueLabel::typo(
            "doc-1", "field2", "a", "b", "proc2",
        ));
        labels.add(QualityIssueLabel::typo(
            "doc-2", "field1", "x", "y", "proc2",
        ));

        assert_eq!(labels.len(), 3);
        assert!(!labels.is_empty());

        let counts = labels.count_by_type();
        assert_eq!(*counts.get(&LabeledIssueType::MissingValue).unwrap(), 1);
        assert_eq!(*counts.get(&LabeledIssueType::Typo).unwrap(), 2);
        assert!(!counts.contains_key(&LabeledIssueType::Duplicate));

        let by_processor = labels.count_by_processor();
        assert_eq!(*by_processor.get("proc1").unwrap(), 1);
        assert_eq!(*by_processor.get("proc2").unwrap(), 2);

        let doc1_labels = labels.for_document("doc-1");
        assert_eq!(doc1_labels.len(), 2);
        assert_eq!(labels.for_field("field1").len(), 2);
        assert_eq!(labels.of_type(LabeledIssueType::Typo).len(), 2);
        assert!(labels.for_document("unknown").is_empty());
    }

    /// Summary counts per type and distinct documents/fields.
    #[test]
    fn test_summary() {
        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::missing_value("doc-1", "field1", "proc1"));
        labels.add(QualityIssueLabel::typo(
            "doc-1", "field2", "a", "b", "proc2",
        ));
        labels.add(QualityIssueLabel::format_variation(
            "doc-2",
            "date",
            "2024-01-01",
            "01/01/2024",
            "proc3",
        ));

        let summary = labels.summary();
        assert_eq!(summary.total_labels, 3);
        assert_eq!(summary.missing_values, 1);
        assert_eq!(summary.typos, 1);
        assert_eq!(summary.format_variations, 1);
        assert_eq!(summary.duplicates, 0);
        assert_eq!(summary.encoding_issues, 0);
        assert_eq!(summary.unique_documents, 2);
        assert_eq!(summary.unique_fields, 3);
    }

    /// An empty collection summarizes to all zeros.
    #[test]
    fn test_summary_of_empty_collection() {
        let summary = QualityLabels::new().summary();
        assert_eq!(summary.total_labels, 0);
        assert_eq!(summary.missing_values, 0);
        assert_eq!(summary.typos, 0);
        assert_eq!(summary.unique_documents, 0);
        assert_eq!(summary.unique_fields, 0);
    }

    /// CSV rows line up with the header and carry the expected cell values.
    #[test]
    fn test_csv_export() {
        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::typo(
            "doc-1",
            "name",
            "Test",
            "Tset",
            "typo_proc",
        ));
        labels.add(
            QualityIssueLabel::missing_value("doc-2", "amount", "missing_proc")
                .with_subtype(QualityIssueSubtype::NullValue),
        );

        let header = QualityLabels::csv_header();
        assert_eq!(header.len(), 9);
        assert_eq!(header[0], "issue_id");
        assert_eq!(header[8], "processor");

        let rows = labels.to_csv_rows();
        assert_eq!(rows.len(), 2);
        assert_eq!(rows[0].len(), 9);
        // Column 0 is the generated issue id, so only the rest is pinned.
        assert_eq!(
            rows[0][1..],
            ["Typo", "", "doc-1", "name", "Test", "Tset", "2", "typo_proc"]
        );
        assert_eq!(
            rows[1][1..],
            ["MissingValue", "NullValue", "doc-2", "amount", "", "", "3", "missing_proc"]
        );
    }
}