import polars as pl
import pytest
from polars.testing import assert_frame_equal
import ambers as am
def _meta_with_labels(labels: dict) -> am.SpssMetadata:
    """Build an ``SpssMetadata`` constructed with only value labels.

    Args:
        labels: Passed through unchanged as ``variable_value_labels``.

    Returns:
        The freshly constructed ``am.SpssMetadata`` instance.
    """
    metadata = am.SpssMetadata(variable_value_labels=labels)
    return metadata
class TestUnlabeledValues:
    """Tests for the ``unlabeled_values`` error reported by ``am.validate``."""

    def test_all_labeled_no_errors(self):
        """Data whose values all have labels yields a valid, error-free report."""
        df = pl.DataFrame({"Q1": [1.0, 2.0, 3.0]})
        meta = _meta_with_labels({"Q1": {1: "A", 2: "B", 3: "C"}})
        report = am.validate(df, meta)
        assert report.is_valid
        assert len(report.errors) == 0

    def test_unlabeled_values_detected(self):
        """Values 3.0 and 99.0 lack labels and are reported in a single error."""
        df = pl.DataFrame({"Q1": [1.0, 2.0, 3.0, 99.0]})
        meta = _meta_with_labels({"Q1": {1: "A", 2: "B"}})
        report = am.validate(df, meta)
        assert not report.is_valid
        assert len(report.errors) == 1
        err = report.errors[0]
        assert err.column == "Q1"
        assert err.check == "unlabeled_values"
        assert 3.0 in err.details["unlabeled_values"]
        assert 99.0 in err.details["unlabeled_values"]
        # Four distinct values in the data, two of which (1 and 2) are labeled.
        assert err.details["unique_in_data"] == 4
        assert err.details["labeled_in_data"] == 2

    def test_column_without_value_labels_skipped(self):
        """A column with no entry in the metadata does not invalidate the report."""
        df = pl.DataFrame({"age": [25.0, 30.0, 35.0]})
        meta = am.SpssMetadata()
        report = am.validate(df, meta)
        assert report.is_valid

    def test_string_column_skipped(self):
        """A string column with an unlabeled value ("CHI") is still valid."""
        df = pl.DataFrame({"city": ["NYC", "LA", "CHI"]})
        meta = _meta_with_labels({"city": {"NYC": "New York", "LA": "Los Angeles"}})
        report = am.validate(df, meta)
        assert report.is_valid

    def test_all_null_no_error(self):
        """A Float64 column containing only nulls produces a valid report."""
        df = pl.DataFrame({"Q1": pl.Series([None, None, None], dtype=pl.Float64)})
        meta = _meta_with_labels({"Q1": {1: "A", 2: "B"}})
        report = am.validate(df, meta)
        assert report.is_valid

    def test_multiple_columns_different_issues(self):
        """Only the columns holding unlabeled values (Q1, Q3) get an error."""
        df = pl.DataFrame({
            "Q1": [1.0, 2.0, 99.0],
            "Q2": [1.0, 2.0, 3.0],
            "Q3": [1.0, 2.0, 98.0],
        })
        meta = _meta_with_labels({
            "Q1": {1: "A", 2: "B"},
            "Q2": {1: "X", 2: "Y", 3: "Z"},
            "Q3": {1: "P", 2: "Q"},
        })
        report = am.validate(df, meta)
        assert not report.is_valid
        assert len(report.errors) == 2
        error_cols = {e.column for e in report.errors}
        assert error_cols == {"Q1", "Q3"}

    def test_labels_in_meta_not_in_data_is_fine(self):
        """Labels for values that never occur in the data are not an error."""
        df = pl.DataFrame({"Q1": [1.0, 2.0]})
        meta = _meta_with_labels({"Q1": {1: "A", 2: "B", 3: "C", 4: "D", 5: "E"}})
        report = am.validate(df, meta)
        assert report.is_valid
class TestDuplicateLabels:
    """Tests for the ``duplicate_labels`` warning reported by ``am.validate``."""

    def test_no_duplicates(self):
        """Distinct label texts produce no warnings."""
        df = pl.DataFrame({"Q1": [1.0, 2.0]})
        meta = _meta_with_labels({"Q1": {1: "A", 2: "B"}})
        report = am.validate(df, meta)
        assert len(report.warnings) == 0

    def test_duplicate_detected(self):
        """Two codes sharing the label "Male" yield one warning, not an error."""
        df = pl.DataFrame({"Q1": [1.0, 2.0]})
        meta = _meta_with_labels({"Q1": {1: "Male", 2: "Female", 9: "Male"}})
        report = am.validate(df, meta)
        # The report stays valid: the duplicate surfaces only as a warning.
        assert report.is_valid
        assert len(report.warnings) == 1
        warn = report.warnings[0]
        assert warn.column == "Q1"
        assert warn.check == "duplicate_labels"
        assert "Male" in warn.details["duplicates"]

    def test_multiple_duplicates(self):
        """Several duplicated labels in one column are grouped into one warning."""
        df = pl.DataFrame({"Q1": [1.0]})
        meta = _meta_with_labels({
            "Q1": {1: "Yes", 2: "No", 3: "Yes", 4: "No"},
        })
        report = am.validate(df, meta)
        assert len(report.warnings) == 1
        dupes = report.warnings[0].details["duplicates"]
        assert "Yes" in dupes
        assert "No" in dupes
class TestFiltering:
    """Tests for the ``columns`` and ``exclude`` arguments of ``am.validate``."""

    def test_columns_specific(self):
        """With ``columns=["Q1"]`` the only error reported is for Q1."""
        data = pl.DataFrame({"Q1": [1.0, 99.0], "Q2": [1.0, 98.0]})
        value_labels = {"Q1": {1: "A"}, "Q2": {1: "X"}}
        metadata = _meta_with_labels(value_labels)

        result = am.validate(data, metadata, columns=["Q1"])

        assert len(result.errors) == 1
        only_error = result.errors[0]
        assert only_error.column == "Q1"

    def test_exclude(self):
        """With ``exclude=["Q1"]`` the only error reported is for Q2."""
        data = pl.DataFrame({"Q1": [1.0, 99.0], "Q2": [1.0, 98.0]})
        value_labels = {"Q1": {1: "A"}, "Q2": {1: "X"}}
        metadata = _meta_with_labels(value_labels)

        result = am.validate(data, metadata, exclude=["Q1"])

        assert len(result.errors) == 1
        only_error = result.errors[0]
        assert only_error.column == "Q2"

    def test_columns_and_exclude_combined(self):
        """A column listed in both ``columns`` and ``exclude`` is not reported."""
        data = pl.DataFrame(
            {
                "Q1": [1.0, 99.0],
                "Q2": [1.0, 98.0],
                "Q3": [1.0, 97.0],
            }
        )
        value_labels = {"Q1": {1: "A"}, "Q2": {1: "X"}, "Q3": {1: "P"}}
        metadata = _meta_with_labels(value_labels)

        result = am.validate(
            data,
            metadata,
            columns=["Q1", "Q2", "Q3"],
            exclude=["Q2"],
        )

        flagged = {issue.column for issue in result.errors}
        assert "Q2" not in flagged
        assert "Q1" in flagged
        assert "Q3" in flagged
class TestReturnTypes:
    """Tests for accepted input types and the ``ValidationReport`` interface."""

    def test_dataframe_input(self):
        """An eager ``pl.DataFrame`` yields an ``am.ValidationReport``."""
        df = pl.DataFrame({"Q1": [1.0, 99.0]})
        meta = _meta_with_labels({"Q1": {1: "A"}})
        report = am.validate(df, meta)
        assert isinstance(report, am.ValidationReport)

    def test_lazyframe_input(self):
        """A ``pl.LazyFrame`` is accepted and its unlabeled value is still caught."""
        lf = pl.DataFrame({"Q1": [1.0, 99.0]}).lazy()
        meta = _meta_with_labels({"Q1": {1: "A"}})
        report = am.validate(lf, meta)
        assert isinstance(report, am.ValidationReport)
        assert not report.is_valid

    def test_is_valid_true_no_errors(self):
        """``is_valid`` is true when every value is labeled."""
        df = pl.DataFrame({"Q1": [1.0, 2.0]})
        meta = _meta_with_labels({"Q1": {1: "A", 2: "B"}})
        report = am.validate(df, meta)
        assert report.is_valid

    def test_is_valid_true_warnings_only(self):
        """``is_valid`` stays true when the report holds only a warning."""
        df = pl.DataFrame({"Q1": [1.0, 2.0]})
        meta = _meta_with_labels({"Q1": {1: "A", 2: "B", 9: "A"}})
        report = am.validate(df, meta)
        assert report.is_valid
        assert len(report.warnings) == 1

    def test_is_valid_false_with_errors(self):
        """``is_valid`` is false when the data holds an unlabeled value."""
        df = pl.DataFrame({"Q1": [1.0, 99.0]})
        meta = _meta_with_labels({"Q1": {1: "A"}})
        report = am.validate(df, meta)
        assert not report.is_valid

    def test_raise_if_invalid_raises_on_errors(self):
        """``raise_if_invalid`` raises ``ValueError`` matching "validate"."""
        df = pl.DataFrame({"Q1": [1.0, 99.0]})
        meta = _meta_with_labels({"Q1": {1: "A"}})
        report = am.validate(df, meta)
        with pytest.raises(ValueError, match="validate"):
            report.raise_if_invalid()

    def test_raise_if_invalid_silent_on_warnings(self):
        """``raise_if_invalid`` does not raise for a warnings-only report."""
        df = pl.DataFrame({"Q1": [1.0, 2.0]})
        meta = _meta_with_labels({"Q1": {1: "A", 2: "B", 9: "A"}})
        report = am.validate(df, meta)
        # No assertion needed: the test passes as long as nothing is raised.
        report.raise_if_invalid()

    def test_to_frame(self):
        """``to_frame`` returns one row per issue with the four expected columns."""
        df = pl.DataFrame({"Q1": [1.0, 99.0]})
        meta = _meta_with_labels({"Q1": {1: "A", 9: "A"}})
        report = am.validate(df, meta)
        frame = report.to_frame()
        assert isinstance(frame, pl.DataFrame)
        assert set(frame.columns) == {"severity", "column", "check", "message"}
        # Two rows expected: 99.0 is unlabeled and the label "A" is duplicated.
        assert frame.height == 2

    def test_empty_report(self):
        """A clean report has no issues and converts to an empty frame."""
        df = pl.DataFrame({"Q1": [1.0, 2.0]})
        meta = _meta_with_labels({"Q1": {1: "A", 2: "B"}})
        report = am.validate(df, meta)
        assert report.is_valid
        assert len(report.issues) == 0
        assert report.to_frame().height == 0

    def test_empty_df(self):
        """A zero-row Float64 column validates cleanly."""
        df = pl.DataFrame({"Q1": pl.Series([], dtype=pl.Float64)})
        meta = _meta_with_labels({"Q1": {1: "A"}})
        report = am.validate(df, meta)
        assert report.is_valid

    def test_invalid_df_type_raises(self):
        """Passing a plain list instead of a frame raises ``TypeError``."""
        with pytest.raises(TypeError):
            am.validate([1, 2, 3], am.SpssMetadata())
class TestReprTruncation:
    """Tests for how ``repr(report)`` truncates long output."""

    def test_many_issues_truncated(self):
        """With 30 errors the repr lists 10 and summarises the remaining 20."""
        df = pl.DataFrame({f"Q{i}": [1.0, 2.0, 99.0] for i in range(30)})
        meta = _meta_with_labels({
            f"Q{i}": {1: "Yes", 2: "No"} for i in range(30)
        })
        report = am.validate(df, meta)
        repr_str = repr(report)
        assert repr_str.count("[x] Q") == 10
        assert "and 20 more errors" in repr_str
        # The repr must point the reader at a way to see the full list.
        assert "report.to_frame()" in repr_str or "report.errors" in repr_str

    def test_long_duplicate_message_truncated(self):
        """No repr line exceeds 80 characters for a heavily duplicated label set."""
        # 30 codes share only three label texts, so each text occurs ten times.
        labels = {i: f"Label{i % 3}" for i in range(30)}
        df = pl.DataFrame({"Q1": [1.0]})
        meta = _meta_with_labels({"Q1": labels})
        report = am.validate(df, meta)
        repr_str = repr(report)
        for line in repr_str.split("\n"):
            assert len(line) <= 80, f"Line too long ({len(line)}): {line}"

    def test_box_width_capped(self):
        """No repr line exceeds 80 characters when label texts are long."""
        labels = {i: f"Very Long Label Name For Code {i}" for i in range(20)}
        df = pl.DataFrame({"Q1": list(range(20))})
        meta = _meta_with_labels({"Q1": labels})
        report = am.validate(df, meta)
        for line in repr(report).split("\n"):
            assert len(line) <= 80, f"Line too long ({len(line)}): {line}"

    def test_to_frame_has_full_messages(self):
        """``to_frame`` returns at least one row for a heavily duplicated label set.

        NOTE(review): the test name suggests the message text should be
        checked for truncation, but only the row count is asserted here.
        """
        labels = {float(i): f"Label{i % 3}" for i in range(30)}
        df = pl.DataFrame({"Q1": [float(i) for i in range(30)]})
        meta = _meta_with_labels({"Q1": labels})
        report = am.validate(df, meta)
        frame = report.to_frame()
        assert frame.height > 0
class TestSharedHelpers:
    """Cross-checks between ``am.validate`` and ``am.apply_labels``."""

    def test_validate_and_apply_labels_flag_same_values(self):
        """An unlabeled 99.0 is reported by validate and rejected by apply_labels."""
        data = pl.DataFrame({"Q1": [1.0, 2.0, 99.0]})
        metadata = _meta_with_labels({"Q1": {1: "A", 2: "B"}})

        result = am.validate(data, metadata)
        assert not result.is_valid
        first_error = result.errors[0]
        assert 99.0 in first_error.details["unlabeled_values"]

        # Same inputs through apply_labels with enum output must raise.
        with pytest.raises(ValueError):
            am.apply_labels(data, metadata, output="enum")