datasynth_generators/data_quality/
labels.rs

//! Quality issue labels for ML training and data quality tracking.
//!
//! This module provides labeling structures for tracking data quality issues
//! injected into synthetic data. Labels can be exported for use in training
//! ML models for data quality detection.
6
7use serde::{Deserialize, Serialize};
8use uuid::Uuid;
9
10/// Type of data quality issue.
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
12#[serde(rename_all = "snake_case")]
13pub enum LabeledIssueType {
14    /// Missing value (null, empty, or placeholder)
15    MissingValue,
16    /// Typo or character error
17    Typo,
18    /// Format variation from standard
19    FormatVariation,
20    /// Duplicate record (exact or near)
21    Duplicate,
22    /// Encoding issue (mojibake, corruption)
23    EncodingIssue,
24    /// Inconsistent data (e.g., mismatched formats)
25    Inconsistency,
26    /// Out of range value
27    OutOfRange,
28    /// Invalid reference (foreign key violation)
29    InvalidReference,
30}
31
32impl LabeledIssueType {
33    /// Get the display name for this issue type.
34    pub fn display_name(&self) -> &'static str {
35        match self {
36            LabeledIssueType::MissingValue => "Missing Value",
37            LabeledIssueType::Typo => "Typo",
38            LabeledIssueType::FormatVariation => "Format Variation",
39            LabeledIssueType::Duplicate => "Duplicate",
40            LabeledIssueType::EncodingIssue => "Encoding Issue",
41            LabeledIssueType::Inconsistency => "Inconsistency",
42            LabeledIssueType::OutOfRange => "Out of Range",
43            LabeledIssueType::InvalidReference => "Invalid Reference",
44        }
45    }
46
47    /// Get the severity level (1-5, with 5 being most severe).
48    pub fn default_severity(&self) -> u8 {
49        match self {
50            LabeledIssueType::MissingValue => 3,
51            LabeledIssueType::Typo => 2,
52            LabeledIssueType::FormatVariation => 1,
53            LabeledIssueType::Duplicate => 4,
54            LabeledIssueType::EncodingIssue => 3,
55            LabeledIssueType::Inconsistency => 2,
56            LabeledIssueType::OutOfRange => 4,
57            LabeledIssueType::InvalidReference => 5,
58        }
59    }
60}
61
62/// Subtype providing more detail about the issue.
63#[derive(Debug, Clone, Serialize, Deserialize)]
64#[serde(rename_all = "snake_case")]
65pub enum QualityIssueSubtype {
66    // Missing value subtypes
67    NullValue,
68    EmptyString,
69    Placeholder,
70    SystematicMissing,
71
72    // Typo subtypes
73    Substitution,
74    Transposition,
75    Insertion,
76    Deletion,
77    OcrError,
78    Homophone,
79
80    // Format variation subtypes
81    DateFormatVariation,
82    AmountFormatVariation,
83    IdentifierFormatVariation,
84    CaseVariation,
85
86    // Duplicate subtypes
87    ExactDuplicate,
88    NearDuplicate,
89    FuzzyDuplicate,
90
91    // Encoding subtypes
92    Mojibake,
93    HtmlEntityCorruption,
94    BomIssue,
95    CharacterCorruption,
96
97    // Generic
98    Other(String),
99}
100
101/// A label describing a data quality issue for ML training.
102#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct QualityIssueLabel {
104    /// Unique identifier for this issue
105    pub issue_id: String,
106    /// Type of quality issue
107    pub issue_type: LabeledIssueType,
108    /// More specific subtype
109    pub subtype: Option<QualityIssueSubtype>,
110    /// ID of the affected document/record
111    pub document_id: String,
112    /// Name of the affected field
113    pub field_name: String,
114    /// Original value before modification (if available)
115    pub original_value: Option<String>,
116    /// Modified/corrupted value (if applicable)
117    pub modified_value: Option<String>,
118    /// Severity level (1-5)
119    pub severity: u8,
120    /// Name of the processor that created this issue
121    pub processor: String,
122    /// Additional metadata
123    #[serde(default)]
124    pub metadata: std::collections::HashMap<String, String>,
125}
126
127impl QualityIssueLabel {
128    /// Create a new quality issue label.
129    pub fn new(
130        issue_type: LabeledIssueType,
131        document_id: impl Into<String>,
132        field_name: impl Into<String>,
133        processor: impl Into<String>,
134    ) -> Self {
135        Self {
136            issue_id: Uuid::new_v4().to_string(),
137            issue_type,
138            subtype: None,
139            document_id: document_id.into(),
140            field_name: field_name.into(),
141            original_value: None,
142            modified_value: None,
143            severity: issue_type.default_severity(),
144            processor: processor.into(),
145            metadata: std::collections::HashMap::new(),
146        }
147    }
148
149    /// Set the subtype.
150    pub fn with_subtype(mut self, subtype: QualityIssueSubtype) -> Self {
151        self.subtype = Some(subtype);
152        self
153    }
154
155    /// Set the original value.
156    pub fn with_original(mut self, value: impl Into<String>) -> Self {
157        self.original_value = Some(value.into());
158        self
159    }
160
161    /// Set the modified value.
162    pub fn with_modified(mut self, value: impl Into<String>) -> Self {
163        self.modified_value = Some(value.into());
164        self
165    }
166
167    /// Set both original and modified values.
168    pub fn with_values(mut self, original: impl Into<String>, modified: impl Into<String>) -> Self {
169        self.original_value = Some(original.into());
170        self.modified_value = Some(modified.into());
171        self
172    }
173
174    /// Set the severity level.
175    pub fn with_severity(mut self, severity: u8) -> Self {
176        self.severity = severity.clamp(1, 5);
177        self
178    }
179
180    /// Add metadata.
181    pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
182        self.metadata.insert(key.into(), value.into());
183        self
184    }
185
186    /// Create a missing value label.
187    pub fn missing_value(
188        document_id: impl Into<String>,
189        field_name: impl Into<String>,
190        processor: impl Into<String>,
191    ) -> Self {
192        Self::new(
193            LabeledIssueType::MissingValue,
194            document_id,
195            field_name,
196            processor,
197        )
198    }
199
200    /// Create a typo label.
201    pub fn typo(
202        document_id: impl Into<String>,
203        field_name: impl Into<String>,
204        original: impl Into<String>,
205        modified: impl Into<String>,
206        processor: impl Into<String>,
207    ) -> Self {
208        Self::new(LabeledIssueType::Typo, document_id, field_name, processor)
209            .with_values(original, modified)
210    }
211
212    /// Create a format variation label.
213    pub fn format_variation(
214        document_id: impl Into<String>,
215        field_name: impl Into<String>,
216        original: impl Into<String>,
217        modified: impl Into<String>,
218        processor: impl Into<String>,
219    ) -> Self {
220        Self::new(
221            LabeledIssueType::FormatVariation,
222            document_id,
223            field_name,
224            processor,
225        )
226        .with_values(original, modified)
227    }
228
229    /// Create a duplicate label.
230    pub fn duplicate(
231        document_id: impl Into<String>,
232        original_doc_id: impl Into<String>,
233        processor: impl Into<String>,
234    ) -> Self {
235        Self::new(
236            LabeledIssueType::Duplicate,
237            document_id,
238            "_record",
239            processor,
240        )
241        .with_metadata("original_document_id", original_doc_id)
242    }
243}
244
245/// Collection of quality issue labels with aggregation methods.
246#[derive(Debug, Clone, Default, Serialize, Deserialize)]
247pub struct QualityLabels {
248    /// All labels in this collection
249    pub labels: Vec<QualityIssueLabel>,
250}
251
252impl QualityLabels {
253    /// Create a new empty label collection.
254    pub fn new() -> Self {
255        Self { labels: Vec::new() }
256    }
257
258    /// Create with pre-allocated capacity.
259    pub fn with_capacity(capacity: usize) -> Self {
260        Self {
261            labels: Vec::with_capacity(capacity),
262        }
263    }
264
265    /// Add a label.
266    pub fn add(&mut self, label: QualityIssueLabel) {
267        self.labels.push(label);
268    }
269
270    /// Extend with more labels.
271    pub fn extend(&mut self, labels: impl IntoIterator<Item = QualityIssueLabel>) {
272        self.labels.extend(labels);
273    }
274
275    /// Get total number of labels.
276    pub fn len(&self) -> usize {
277        self.labels.len()
278    }
279
280    /// Check if empty.
281    pub fn is_empty(&self) -> bool {
282        self.labels.is_empty()
283    }
284
285    /// Count labels by type.
286    pub fn count_by_type(&self) -> std::collections::HashMap<LabeledIssueType, usize> {
287        let mut counts = std::collections::HashMap::new();
288        for label in &self.labels {
289            *counts.entry(label.issue_type).or_insert(0) += 1;
290        }
291        counts
292    }
293
294    /// Count labels by processor.
295    pub fn count_by_processor(&self) -> std::collections::HashMap<String, usize> {
296        let mut counts = std::collections::HashMap::new();
297        for label in &self.labels {
298            *counts.entry(label.processor.clone()).or_insert(0) += 1;
299        }
300        counts
301    }
302
303    /// Get labels for a specific document.
304    pub fn for_document(&self, document_id: &str) -> Vec<&QualityIssueLabel> {
305        self.labels
306            .iter()
307            .filter(|l| l.document_id == document_id)
308            .collect()
309    }
310
311    /// Get labels for a specific field.
312    pub fn for_field(&self, field_name: &str) -> Vec<&QualityIssueLabel> {
313        self.labels
314            .iter()
315            .filter(|l| l.field_name == field_name)
316            .collect()
317    }
318
319    /// Get labels of a specific type.
320    pub fn of_type(&self, issue_type: LabeledIssueType) -> Vec<&QualityIssueLabel> {
321        self.labels
322            .iter()
323            .filter(|l| l.issue_type == issue_type)
324            .collect()
325    }
326
327    /// Get summary statistics.
328    pub fn summary(&self) -> QualityLabelSummary {
329        let counts = self.count_by_type();
330        QualityLabelSummary {
331            total_labels: self.labels.len(),
332            missing_values: *counts.get(&LabeledIssueType::MissingValue).unwrap_or(&0),
333            typos: *counts.get(&LabeledIssueType::Typo).unwrap_or(&0),
334            format_variations: *counts.get(&LabeledIssueType::FormatVariation).unwrap_or(&0),
335            duplicates: *counts.get(&LabeledIssueType::Duplicate).unwrap_or(&0),
336            encoding_issues: *counts.get(&LabeledIssueType::EncodingIssue).unwrap_or(&0),
337            unique_documents: self
338                .labels
339                .iter()
340                .map(|l| &l.document_id)
341                .collect::<std::collections::HashSet<_>>()
342                .len(),
343            unique_fields: self
344                .labels
345                .iter()
346                .map(|l| &l.field_name)
347                .collect::<std::collections::HashSet<_>>()
348                .len(),
349        }
350    }
351
352    /// Convert to CSV rows.
353    pub fn to_csv_rows(&self) -> Vec<Vec<String>> {
354        self.labels
355            .iter()
356            .map(|l| {
357                vec![
358                    l.issue_id.clone(),
359                    format!("{:?}", l.issue_type),
360                    l.subtype
361                        .as_ref()
362                        .map(|s| format!("{:?}", s))
363                        .unwrap_or_default(),
364                    l.document_id.clone(),
365                    l.field_name.clone(),
366                    l.original_value.clone().unwrap_or_default(),
367                    l.modified_value.clone().unwrap_or_default(),
368                    l.severity.to_string(),
369                    l.processor.clone(),
370                ]
371            })
372            .collect()
373    }
374
375    /// Get CSV header.
376    pub fn csv_header() -> Vec<&'static str> {
377        vec![
378            "issue_id",
379            "issue_type",
380            "subtype",
381            "document_id",
382            "field_name",
383            "original_value",
384            "modified_value",
385            "severity",
386            "processor",
387        ]
388    }
389}
390
391/// Summary statistics for quality labels.
392#[derive(Debug, Clone, Default, Serialize, Deserialize)]
393pub struct QualityLabelSummary {
394    /// Total number of labels
395    pub total_labels: usize,
396    /// Number of missing value issues
397    pub missing_values: usize,
398    /// Number of typo issues
399    pub typos: usize,
400    /// Number of format variation issues
401    pub format_variations: usize,
402    /// Number of duplicate issues
403    pub duplicates: usize,
404    /// Number of encoding issues
405    pub encoding_issues: usize,
406    /// Number of unique documents affected
407    pub unique_documents: usize,
408    /// Number of unique fields affected
409    pub unique_fields: usize,
410}
411
#[cfg(test)]
mod tests {
    use super::*;

    /// A label built through `new` plus the builder methods carries the
    /// expected type, identifiers and values.
    #[test]
    fn test_label_creation() {
        let base = QualityIssueLabel::new(
            LabeledIssueType::Typo,
            "doc-123",
            "vendor_name",
            "typo_processor",
        );
        let label = base
            .with_values("Acme Corp", "Acne Corp")
            .with_subtype(QualityIssueSubtype::Substitution);

        assert_eq!(label.issue_type, LabeledIssueType::Typo);
        assert_eq!(label.document_id, "doc-123");
        assert_eq!(label.field_name, "vendor_name");
        assert_eq!(label.original_value.as_deref(), Some("Acme Corp"));
        assert_eq!(label.modified_value.as_deref(), Some("Acne Corp"));
    }

    /// Each convenience constructor yields a label of the matching type.
    #[test]
    fn test_label_helpers() {
        let missing = QualityIssueLabel::missing_value("doc-1", "amount", "missing_processor");
        assert_eq!(missing.issue_type, LabeledIssueType::MissingValue);

        let typo = QualityIssueLabel::typo("doc-2", "name", "John", "Jphn", "typo_processor");
        assert_eq!(typo.issue_type, LabeledIssueType::Typo);
        assert_eq!(typo.original_value.as_deref(), Some("John"));

        let duplicate = QualityIssueLabel::duplicate("doc-3", "doc-1", "dup_processor");
        assert_eq!(duplicate.issue_type, LabeledIssueType::Duplicate);
    }

    /// Adding labels updates the length, the per-type counts and the
    /// per-document lookup.
    #[test]
    fn test_quality_labels_collection() {
        let mut labels = QualityLabels::new();
        for label in [
            QualityIssueLabel::missing_value("doc-1", "field1", "proc1"),
            QualityIssueLabel::typo("doc-1", "field2", "a", "b", "proc2"),
            QualityIssueLabel::typo("doc-2", "field1", "x", "y", "proc2"),
        ] {
            labels.add(label);
        }

        assert_eq!(labels.len(), 3);

        let counts = labels.count_by_type();
        assert_eq!(counts[&LabeledIssueType::MissingValue], 1);
        assert_eq!(counts[&LabeledIssueType::Typo], 2);

        assert_eq!(labels.for_document("doc-1").len(), 2);
    }

    /// The summary reports the total, the per-type counts and the number of
    /// distinct documents and fields.
    #[test]
    fn test_summary() {
        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::missing_value("doc-1", "field1", "proc1"));
        labels.add(QualityIssueLabel::typo("doc-1", "field2", "a", "b", "proc2"));
        labels.add(QualityIssueLabel::format_variation(
            "doc-2",
            "date",
            "2024-01-01",
            "01/01/2024",
            "proc3",
        ));

        let QualityLabelSummary {
            total_labels,
            missing_values,
            typos,
            format_variations,
            unique_documents,
            unique_fields,
            ..
        } = labels.summary();

        assert_eq!(total_labels, 3);
        assert_eq!(missing_values, 1);
        assert_eq!(typos, 1);
        assert_eq!(format_variations, 1);
        assert_eq!(unique_documents, 2);
        assert_eq!(unique_fields, 3);
    }

    /// The CSV header and every exported row have nine columns.
    #[test]
    fn test_csv_export() {
        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::typo(
            "doc-1",
            "name",
            "Test",
            "Tset",
            "typo_proc",
        ));

        assert_eq!(QualityLabels::csv_header().len(), 9);

        let rows = labels.to_csv_rows();
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 9);
    }
}