1use serde::{Deserialize, Serialize};
8use uuid::Uuid;
9
10#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
12#[serde(rename_all = "snake_case")]
13pub enum LabeledIssueType {
14 MissingValue,
16 Typo,
18 FormatVariation,
20 Duplicate,
22 EncodingIssue,
24 Inconsistency,
26 OutOfRange,
28 InvalidReference,
30}
31
32impl LabeledIssueType {
33 pub fn display_name(&self) -> &'static str {
35 match self {
36 LabeledIssueType::MissingValue => "Missing Value",
37 LabeledIssueType::Typo => "Typo",
38 LabeledIssueType::FormatVariation => "Format Variation",
39 LabeledIssueType::Duplicate => "Duplicate",
40 LabeledIssueType::EncodingIssue => "Encoding Issue",
41 LabeledIssueType::Inconsistency => "Inconsistency",
42 LabeledIssueType::OutOfRange => "Out of Range",
43 LabeledIssueType::InvalidReference => "Invalid Reference",
44 }
45 }
46
47 pub fn default_severity(&self) -> u8 {
49 match self {
50 LabeledIssueType::MissingValue => 3,
51 LabeledIssueType::Typo => 2,
52 LabeledIssueType::FormatVariation => 1,
53 LabeledIssueType::Duplicate => 4,
54 LabeledIssueType::EncodingIssue => 3,
55 LabeledIssueType::Inconsistency => 2,
56 LabeledIssueType::OutOfRange => 4,
57 LabeledIssueType::InvalidReference => 5,
58 }
59 }
60}
61
62#[derive(Debug, Clone, Serialize, Deserialize)]
64#[serde(rename_all = "snake_case")]
65pub enum QualityIssueSubtype {
66 NullValue,
68 EmptyString,
69 Placeholder,
70 SystematicMissing,
71
72 Substitution,
74 Transposition,
75 Insertion,
76 Deletion,
77 OcrError,
78 Homophone,
79
80 DateFormatVariation,
82 AmountFormatVariation,
83 IdentifierFormatVariation,
84 CaseVariation,
85
86 ExactDuplicate,
88 NearDuplicate,
89 FuzzyDuplicate,
90
91 Mojibake,
93 HtmlEntityCorruption,
94 BomIssue,
95 CharacterCorruption,
96
97 Other(String),
99}
100
101#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct QualityIssueLabel {
104 pub issue_id: String,
106 pub issue_type: LabeledIssueType,
108 pub subtype: Option<QualityIssueSubtype>,
110 pub document_id: String,
112 pub field_name: String,
114 pub original_value: Option<String>,
116 pub modified_value: Option<String>,
118 pub severity: u8,
120 pub processor: String,
122 #[serde(default)]
124 pub metadata: std::collections::HashMap<String, String>,
125}
126
127impl QualityIssueLabel {
128 pub fn new(
130 issue_type: LabeledIssueType,
131 document_id: impl Into<String>,
132 field_name: impl Into<String>,
133 processor: impl Into<String>,
134 ) -> Self {
135 Self {
136 issue_id: Uuid::new_v4().to_string(),
137 issue_type,
138 subtype: None,
139 document_id: document_id.into(),
140 field_name: field_name.into(),
141 original_value: None,
142 modified_value: None,
143 severity: issue_type.default_severity(),
144 processor: processor.into(),
145 metadata: std::collections::HashMap::new(),
146 }
147 }
148
149 pub fn with_subtype(mut self, subtype: QualityIssueSubtype) -> Self {
151 self.subtype = Some(subtype);
152 self
153 }
154
155 pub fn with_original(mut self, value: impl Into<String>) -> Self {
157 self.original_value = Some(value.into());
158 self
159 }
160
161 pub fn with_modified(mut self, value: impl Into<String>) -> Self {
163 self.modified_value = Some(value.into());
164 self
165 }
166
167 pub fn with_values(mut self, original: impl Into<String>, modified: impl Into<String>) -> Self {
169 self.original_value = Some(original.into());
170 self.modified_value = Some(modified.into());
171 self
172 }
173
174 pub fn with_severity(mut self, severity: u8) -> Self {
176 self.severity = severity.clamp(1, 5);
177 self
178 }
179
180 pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
182 self.metadata.insert(key.into(), value.into());
183 self
184 }
185
186 pub fn missing_value(
188 document_id: impl Into<String>,
189 field_name: impl Into<String>,
190 processor: impl Into<String>,
191 ) -> Self {
192 Self::new(
193 LabeledIssueType::MissingValue,
194 document_id,
195 field_name,
196 processor,
197 )
198 }
199
200 pub fn typo(
202 document_id: impl Into<String>,
203 field_name: impl Into<String>,
204 original: impl Into<String>,
205 modified: impl Into<String>,
206 processor: impl Into<String>,
207 ) -> Self {
208 Self::new(LabeledIssueType::Typo, document_id, field_name, processor)
209 .with_values(original, modified)
210 }
211
212 pub fn format_variation(
214 document_id: impl Into<String>,
215 field_name: impl Into<String>,
216 original: impl Into<String>,
217 modified: impl Into<String>,
218 processor: impl Into<String>,
219 ) -> Self {
220 Self::new(
221 LabeledIssueType::FormatVariation,
222 document_id,
223 field_name,
224 processor,
225 )
226 .with_values(original, modified)
227 }
228
229 pub fn duplicate(
231 document_id: impl Into<String>,
232 original_doc_id: impl Into<String>,
233 processor: impl Into<String>,
234 ) -> Self {
235 Self::new(
236 LabeledIssueType::Duplicate,
237 document_id,
238 "_record",
239 processor,
240 )
241 .with_metadata("original_document_id", original_doc_id)
242 }
243}
244
245#[derive(Debug, Clone, Default, Serialize, Deserialize)]
247pub struct QualityLabels {
248 pub labels: Vec<QualityIssueLabel>,
250}
251
252impl QualityLabels {
253 pub fn new() -> Self {
255 Self { labels: Vec::new() }
256 }
257
258 pub fn with_capacity(capacity: usize) -> Self {
260 Self {
261 labels: Vec::with_capacity(capacity),
262 }
263 }
264
265 pub fn add(&mut self, label: QualityIssueLabel) {
267 self.labels.push(label);
268 }
269
270 pub fn extend(&mut self, labels: impl IntoIterator<Item = QualityIssueLabel>) {
272 self.labels.extend(labels);
273 }
274
275 pub fn len(&self) -> usize {
277 self.labels.len()
278 }
279
280 pub fn is_empty(&self) -> bool {
282 self.labels.is_empty()
283 }
284
285 pub fn count_by_type(&self) -> std::collections::HashMap<LabeledIssueType, usize> {
287 let mut counts = std::collections::HashMap::new();
288 for label in &self.labels {
289 *counts.entry(label.issue_type).or_insert(0) += 1;
290 }
291 counts
292 }
293
294 pub fn count_by_processor(&self) -> std::collections::HashMap<String, usize> {
296 let mut counts = std::collections::HashMap::new();
297 for label in &self.labels {
298 *counts.entry(label.processor.clone()).or_insert(0) += 1;
299 }
300 counts
301 }
302
303 pub fn for_document(&self, document_id: &str) -> Vec<&QualityIssueLabel> {
305 self.labels
306 .iter()
307 .filter(|l| l.document_id == document_id)
308 .collect()
309 }
310
311 pub fn for_field(&self, field_name: &str) -> Vec<&QualityIssueLabel> {
313 self.labels
314 .iter()
315 .filter(|l| l.field_name == field_name)
316 .collect()
317 }
318
319 pub fn of_type(&self, issue_type: LabeledIssueType) -> Vec<&QualityIssueLabel> {
321 self.labels
322 .iter()
323 .filter(|l| l.issue_type == issue_type)
324 .collect()
325 }
326
327 pub fn summary(&self) -> QualityLabelSummary {
329 let counts = self.count_by_type();
330 QualityLabelSummary {
331 total_labels: self.labels.len(),
332 missing_values: *counts.get(&LabeledIssueType::MissingValue).unwrap_or(&0),
333 typos: *counts.get(&LabeledIssueType::Typo).unwrap_or(&0),
334 format_variations: *counts.get(&LabeledIssueType::FormatVariation).unwrap_or(&0),
335 duplicates: *counts.get(&LabeledIssueType::Duplicate).unwrap_or(&0),
336 encoding_issues: *counts.get(&LabeledIssueType::EncodingIssue).unwrap_or(&0),
337 unique_documents: self
338 .labels
339 .iter()
340 .map(|l| &l.document_id)
341 .collect::<std::collections::HashSet<_>>()
342 .len(),
343 unique_fields: self
344 .labels
345 .iter()
346 .map(|l| &l.field_name)
347 .collect::<std::collections::HashSet<_>>()
348 .len(),
349 }
350 }
351
352 pub fn to_csv_rows(&self) -> Vec<Vec<String>> {
354 self.labels
355 .iter()
356 .map(|l| {
357 vec![
358 l.issue_id.clone(),
359 format!("{:?}", l.issue_type),
360 l.subtype
361 .as_ref()
362 .map(|s| format!("{:?}", s))
363 .unwrap_or_default(),
364 l.document_id.clone(),
365 l.field_name.clone(),
366 l.original_value.clone().unwrap_or_default(),
367 l.modified_value.clone().unwrap_or_default(),
368 l.severity.to_string(),
369 l.processor.clone(),
370 ]
371 })
372 .collect()
373 }
374
375 pub fn csv_header() -> Vec<&'static str> {
377 vec![
378 "issue_id",
379 "issue_type",
380 "subtype",
381 "document_id",
382 "field_name",
383 "original_value",
384 "modified_value",
385 "severity",
386 "processor",
387 ]
388 }
389}
390
391#[derive(Debug, Clone, Default, Serialize, Deserialize)]
393pub struct QualityLabelSummary {
394 pub total_labels: usize,
396 pub missing_values: usize,
398 pub typos: usize,
400 pub format_variations: usize,
402 pub duplicates: usize,
404 pub encoding_issues: usize,
406 pub unique_documents: usize,
408 pub unique_fields: usize,
410}
411
#[cfg(test)]
mod tests {
    use super::*;

    /// The constructor and builder methods populate the expected fields.
    #[test]
    fn test_label_creation() {
        let label = QualityIssueLabel::new(
            LabeledIssueType::Typo,
            "doc-123",
            "vendor_name",
            "typo_processor",
        )
        .with_subtype(QualityIssueSubtype::Substitution)
        .with_values("Acme Corp", "Acne Corp");

        assert_eq!(label.issue_type, LabeledIssueType::Typo);
        assert_eq!(label.document_id, "doc-123");
        assert_eq!(label.field_name, "vendor_name");
        assert_eq!(label.original_value.as_deref(), Some("Acme Corp"));
        assert_eq!(label.modified_value.as_deref(), Some("Acne Corp"));
    }

    /// Each per-type helper produces a label of the matching issue type.
    #[test]
    fn test_label_helpers() {
        let missing = QualityIssueLabel::missing_value("doc-1", "amount", "missing_processor");
        let typo = QualityIssueLabel::typo("doc-2", "name", "John", "Jphn", "typo_processor");
        let duplicate = QualityIssueLabel::duplicate("doc-3", "doc-1", "dup_processor");

        assert_eq!(missing.issue_type, LabeledIssueType::MissingValue);

        assert_eq!(typo.issue_type, LabeledIssueType::Typo);
        assert_eq!(typo.original_value.as_deref(), Some("John"));

        assert_eq!(duplicate.issue_type, LabeledIssueType::Duplicate);
    }

    /// Adding, counting by type and filtering by document work together.
    #[test]
    fn test_quality_labels_collection() {
        let missing = QualityIssueLabel::missing_value("doc-1", "field1", "proc1");
        let first_typo = QualityIssueLabel::typo("doc-1", "field2", "a", "b", "proc2");
        let second_typo = QualityIssueLabel::typo("doc-2", "field1", "x", "y", "proc2");

        let mut labels = QualityLabels::new();
        labels.add(missing);
        labels.add(first_typo);
        labels.add(second_typo);

        assert_eq!(labels.len(), 3);

        let counts = labels.count_by_type();
        assert_eq!(
            counts.get(&LabeledIssueType::MissingValue).copied(),
            Some(1)
        );
        assert_eq!(counts.get(&LabeledIssueType::Typo).copied(), Some(2));

        assert_eq!(labels.for_document("doc-1").len(), 2);
    }

    /// The summary reports per-type counts and distinct documents and fields.
    #[test]
    fn test_summary() {
        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::missing_value("doc-1", "field1", "proc1"));
        labels.add(QualityIssueLabel::typo("doc-1", "field2", "a", "b", "proc2"));
        labels.add(QualityIssueLabel::format_variation(
            "doc-2",
            "date",
            "2024-01-01",
            "01/01/2024",
            "proc3",
        ));

        let QualityLabelSummary {
            total_labels,
            missing_values,
            typos,
            format_variations,
            unique_documents,
            unique_fields,
            ..
        } = labels.summary();

        assert_eq!(total_labels, 3);
        assert_eq!(missing_values, 1);
        assert_eq!(typos, 1);
        assert_eq!(format_variations, 1);
        assert_eq!(unique_documents, 2);
        assert_eq!(unique_fields, 3);
    }

    /// The header and every exported row have nine columns.
    #[test]
    fn test_csv_export() {
        let mut labels = QualityLabels::new();
        labels.add(QualityIssueLabel::typo(
            "doc-1",
            "name",
            "Test",
            "Tset",
            "typo_proc",
        ));

        assert_eq!(QualityLabels::csv_header().len(), 9);

        let rows = labels.to_csv_rows();
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 9);
    }
}