floe_core/checks/
unique.rs

1use std::collections::HashSet;
2
3use polars::prelude::{AnyValue, DataFrame};
4
5use crate::{config, ConfigError, FloeResult};
6use super::RowError;
7
8pub fn unique_errors(
9    df: &DataFrame,
10    columns: &[config::ColumnConfig],
11) -> FloeResult<Vec<Vec<RowError>>> {
12    let mut errors_per_row = vec![Vec::new(); df.height()];
13    let unique_columns: Vec<&config::ColumnConfig> = columns
14        .iter()
15        .filter(|col| col.unique == Some(true))
16        .collect();
17    if unique_columns.is_empty() {
18        return Ok(errors_per_row);
19    }
20
21    for column in unique_columns {
22        let series = df
23            .column(&column.name)
24            .map_err(|err| {
25                Box::new(ConfigError(format!(
26                    "unique column {} not found: {err}",
27                    column.name
28                )))
29            })?;
30        let mut seen = HashSet::new();
31        for row_idx in 0..df.height() {
32            let value = series.get(row_idx).map_err(|err| {
33                Box::new(ConfigError(format!(
34                    "unique column {} read failed: {err}",
35                    column.name
36                )))
37            })?;
38            if matches!(value, AnyValue::Null) {
39                continue;
40            }
41            let key = value.to_string();
42            if !seen.insert(key) {
43                errors_per_row[row_idx].push(RowError::new(
44                    "unique",
45                    &column.name,
46                    "duplicate value",
47                ));
48            }
49        }
50    }
51
52    Ok(errors_per_row)
53}