import pytest
from sparkless.testing import get_imports
# get_imports() comes from sparkless.testing; the object it returns exposes
# the Spark-style API names that are aliased below for use in the tests.
imports = get_imports()
SparkSession = imports.SparkSession
# Schema building blocks used by tests that pass an explicit schema.
StringType = imports.StringType
IntegerType = imports.IntegerType
StructType = imports.StructType
StructField = imports.StructField
# Functions namespace (used below as F.col(...)).
F = imports.F
class TestFillnaSubset:
    """Tests for ``DataFrame.fillna`` when called with the ``subset`` argument.

    Every test relies on a ``spark`` fixture that is not defined in this
    class (presumably supplied by a conftest -- confirm), and on ``collect()``
    returning rows in the order they were created.
    """

    @pytest.fixture
    def sample_df(self, spark):
        """Return a three-row DataFrame with one null in ``key`` and one in ``value``."""
        data = [
            {"key": "A", "value": "1"},
            {"key": None, "value": "2"},
            {"key": "C", "value": None},
        ]
        return spark.createDataFrame(data)

    def test_fillna_subset_string_single_column(self, sample_df):
        """``subset`` given as a plain string fills only that column."""
        result = sample_df.fillna("", subset="value")
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[0]["value"] == "1"
        assert rows[1]["key"] is None
        assert rows[1]["value"] == "2"
        assert rows[2]["key"] == "C"
        assert rows[2]["value"] == ""

    def test_fillna_subset_list_multiple_columns(self, sample_df):
        """``subset`` given as a list fills every listed column."""
        result = sample_df.fillna("", subset=["key", "value"])
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[0]["value"] == "1"
        assert rows[1]["key"] == ""
        assert rows[1]["value"] == "2"
        assert rows[2]["key"] == "C"
        assert rows[2]["value"] == ""

    def test_fillna_subset_tuple_multiple_columns(self, sample_df):
        """``subset`` given as a tuple behaves like the list form."""
        result = sample_df.fillna("", subset=("key", "value"))
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[0]["value"] == "1"
        assert rows[1]["key"] == ""
        assert rows[1]["value"] == "2"
        assert rows[2]["key"] == "C"
        assert rows[2]["value"] == ""

    def test_fillna_subset_only_specified_columns_filled(self, spark):
        """Nulls outside ``subset`` (here ``col2``) are left as null."""
        data = [
            {"col1": None, "col2": "B", "col3": None},
            {"col1": "A", "col2": None, "col3": "C"},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("FILLED", subset=["col1", "col3"])
        rows = result.collect()
        assert rows[0]["col1"] == "FILLED"
        assert rows[0]["col2"] == "B"
        assert rows[0]["col3"] == "FILLED"
        assert rows[1]["col1"] == "A"
        assert rows[1]["col2"] is None
        assert rows[1]["col3"] == "C"

    def test_fillna_subset_other_columns_unchanged(self, sample_df):
        """Filling ``value`` does not touch the ``key`` column."""
        result = sample_df.fillna("FILLED", subset="value")
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[1]["key"] is None
        assert rows[2]["key"] == "C"

    def test_fillna_dict_value_ignores_subset(self, sample_df):
        """With a dict fill value, ``subset`` is ignored and both keys apply."""
        result = sample_df.fillna(
            {"key": "DEFAULT_KEY", "value": "DEFAULT_VALUE"}, subset="value"
        )
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[0]["value"] == "1"
        assert rows[1]["key"] == "DEFAULT_KEY"
        assert rows[1]["value"] == "2"
        assert rows[2]["key"] == "C"
        assert rows[2]["value"] == "DEFAULT_VALUE"

    def test_fillna_subset_nonexistent_column_raises_error(self, sample_df):
        """An unknown column name in ``subset`` raises, naming the column."""
        with pytest.raises(Exception, match="nonexistent"):
            sample_df.fillna("", subset="nonexistent")

    def test_fillna_subset_multiple_nonexistent_columns_raises_error(self, sample_df):
        """A ``subset`` list containing an unknown column raises."""
        with pytest.raises(Exception):
            sample_df.fillna("", subset=["key", "nonexistent"])

    def test_fillna_subset_empty_list(self, sample_df):
        """An empty ``subset`` list fills nothing."""
        result = sample_df.fillna("FILLED", subset=[])
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[0]["value"] == "1"
        assert rows[1]["key"] is None
        assert rows[1]["value"] == "2"
        assert rows[2]["key"] == "C"
        assert rows[2]["value"] is None

    def test_fillna_subset_all_columns(self, sample_df):
        """Listing every column in ``subset`` fills every null."""
        result = sample_df.fillna("FILLED", subset=["key", "value"])
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[0]["value"] == "1"
        assert rows[1]["key"] == "FILLED"
        assert rows[1]["value"] == "2"
        assert rows[2]["key"] == "C"
        assert rows[2]["value"] == "FILLED"

    def test_fillna_no_subset_backward_compatibility(self, sample_df):
        """Without ``subset``, nulls in all columns are filled."""
        result = sample_df.fillna("FILLED")
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[0]["value"] == "1"
        assert rows[1]["key"] == "FILLED"
        assert rows[1]["value"] == "2"
        assert rows[2]["key"] == "C"
        assert rows[2]["value"] == "FILLED"

    def test_fillna_subset_issue_234_example(self, spark):
        """Issue 234 scenario using a one-element list for ``subset``."""
        df = spark.createDataFrame(
            [
                {"key": "A", "value": "1"},
                {"key": None, "value": "2"},
                {"key": "C", "value": None},
            ]
        )
        result = df.fillna("", subset=["value"])
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[0]["value"] == "1"
        assert rows[1]["key"] is None
        assert rows[1]["value"] == "2"
        assert rows[2]["key"] == "C"
        assert rows[2]["value"] == ""

    def test_fillna_subset_issue_234_string_variant(self, spark):
        """Issue 234 scenario using a plain string for ``subset``."""
        df = spark.createDataFrame(
            [
                {"key": "A", "value": "1"},
                {"key": None, "value": "2"},
                {"key": "C", "value": None},
            ]
        )
        result = df.fillna("", subset="value")
        rows = result.collect()
        assert rows[0]["key"] == "A"
        assert rows[0]["value"] == "1"
        assert rows[1]["key"] is None
        assert rows[1]["value"] == "2"
        assert rows[2]["key"] == "C"
        assert rows[2]["value"] == ""

    def test_fillna_subset_numeric_value(self, spark):
        """An integer fill value fills only the listed integer columns."""
        data = [
            {"col1": None, "col2": 10, "col3": None},
            {"col1": 5, "col2": None, "col3": 20},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna(0, subset=["col1", "col3"])
        rows = result.collect()
        assert rows[0]["col1"] == 0
        assert rows[0]["col2"] == 10
        assert rows[0]["col3"] == 0
        assert rows[1]["col1"] == 5
        assert rows[1]["col2"] is None
        assert rows[1]["col3"] == 20

    def test_fillna_subset_boolean_values(self, spark):
        """A boolean fill value fills ``active`` but not ``verified``."""
        data = [
            {"name": "Alice", "active": None, "verified": True},
            {"name": "Bob", "active": False, "verified": None},
            {"name": "Charlie", "active": None, "verified": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna(False, subset=["active"])
        rows = result.collect()
        assert rows[0]["name"] == "Alice"
        assert rows[0]["active"] is False
        assert rows[0]["verified"] is True
        assert rows[1]["name"] == "Bob"
        assert rows[1]["active"] is False
        assert rows[1]["verified"] is None
        assert rows[2]["name"] == "Charlie"
        assert rows[2]["active"] is False
        assert rows[2]["verified"] is None

    def test_fillna_subset_float_values(self, spark):
        """A float fill value fills ``price`` but not ``discount``."""
        data = [
            {"id": 1, "price": None, "discount": 0.1},
            {"id": 2, "price": 99.99, "discount": None},
            {"id": 3, "price": None, "discount": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna(0.0, subset=["price"])
        rows = result.collect()
        assert rows[0]["id"] == 1
        assert rows[0]["price"] == 0.0
        assert rows[0]["discount"] == 0.1
        assert rows[1]["id"] == 2
        assert rows[1]["price"] == 99.99
        assert rows[1]["discount"] is None
        assert rows[2]["id"] == 3
        assert rows[2]["price"] == 0.0
        assert rows[2]["discount"] is None

    def test_fillna_subset_multiple_nulls_same_column(self, spark):
        """Several nulls in one column are all filled; non-null values are kept."""
        data = [
            {"col1": "A", "col2": None},
            {"col1": "B", "col2": None},
            {"col1": "C", "col2": None},
            {"col1": "D", "col2": "X"},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("FILLED", subset=["col2"])
        rows = result.collect()
        assert rows[0]["col1"] == "A"
        assert rows[0]["col2"] == "FILLED"
        assert rows[1]["col1"] == "B"
        assert rows[1]["col2"] == "FILLED"
        assert rows[2]["col1"] == "C"
        assert rows[2]["col2"] == "FILLED"
        assert rows[3]["col1"] == "D"
        assert rows[3]["col2"] == "X"

    def test_fillna_subset_all_nulls_in_column(self, spark):
        """A column that is entirely null is filled in every row."""
        # Explicit schema: col2 holds only nulls in the data below.
        schema = StructType(
            [
                StructField("col1", StringType()),
                StructField("col2", StringType()),
            ]
        )
        data = [
            {"col1": "A", "col2": None},
            {"col1": "B", "col2": None},
            {"col1": "C", "col2": None},
        ]
        df = spark.createDataFrame(data, schema)
        result = df.fillna("ALL_NULL", subset=["col2"])
        rows = result.collect()
        assert rows[0]["col1"] == "A"
        assert rows[0]["col2"] == "ALL_NULL"
        assert rows[1]["col1"] == "B"
        assert rows[1]["col2"] == "ALL_NULL"
        assert rows[2]["col1"] == "C"
        assert rows[2]["col2"] == "ALL_NULL"

    def test_fillna_subset_no_nulls_in_subset_columns(self, spark):
        """When the subset columns have no nulls, the data is unchanged."""
        schema = StructType(
            [
                StructField("col1", StringType()),
                StructField("col2", StringType()),
                StructField("col3", StringType()),
            ]
        )
        data = [
            {"col1": "A", "col2": "X", "col3": None},
            {"col1": "B", "col2": "Y", "col3": None},
        ]
        df = spark.createDataFrame(data, schema)
        result = df.fillna("FILLED", subset=["col1", "col2"])
        rows = result.collect()
        assert rows[0]["col1"] == "A"
        assert rows[0]["col2"] == "X"
        assert rows[0]["col3"] is None
        assert rows[1]["col1"] == "B"
        assert rows[1]["col2"] == "Y"
        assert rows[1]["col3"] is None

    def test_fillna_subset_mixed_data_types(self, spark):
        """Filling a string column leaves nulls in other-typed columns alone."""
        data = [
            {"name": None, "age": 25, "score": None, "active": True},
            {"name": "Bob", "age": None, "score": 85.5, "active": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("UNKNOWN", subset=["name"])
        rows = result.collect()
        assert rows[0]["name"] == "UNKNOWN"
        assert rows[0]["age"] == 25
        assert rows[0]["score"] is None
        assert rows[0]["active"] is True
        assert rows[1]["name"] == "Bob"
        assert rows[1]["age"] is None
        assert rows[1]["score"] == 85.5
        assert rows[1]["active"] is None

    def test_fillna_subset_empty_dataframe(self, spark):
        """``fillna`` on an empty DataFrame yields no rows."""
        schema = StructType(
            [
                StructField("name", StringType()),
                StructField("age", IntegerType()),
            ]
        )
        df = spark.createDataFrame([], schema)
        result = df.fillna("UNKNOWN", subset=["name"])
        rows = result.collect()
        assert len(rows) == 0

    def test_fillna_subset_single_row(self, spark):
        """A single-row DataFrame has its subset columns filled."""
        schema = StructType(
            [
                StructField("col1", StringType()),
                StructField("col2", StringType()),
                StructField("col3", StringType()),
            ]
        )
        data = [{"col1": None, "col2": "X", "col3": None}]
        df = spark.createDataFrame(data, schema)
        result = df.fillna("FILLED", subset=["col1", "col3"])
        rows = result.collect()
        assert len(rows) == 1
        assert rows[0]["col1"] == "FILLED"
        assert rows[0]["col2"] == "X"
        assert rows[0]["col3"] == "FILLED"

    def test_fillna_subset_chained_operations(self, spark):
        """Two chained ``fillna`` calls each fill their own subset."""
        data = [
            {"name": None, "age": 25, "city": None},
            {"name": "Bob", "age": None, "city": "NYC"},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("UNKNOWN", subset=["name"]).fillna("N/A", subset=["city"])
        rows = result.collect()
        assert rows[0]["name"] == "UNKNOWN"
        assert rows[0]["age"] == 25
        assert rows[0]["city"] == "N/A"
        assert rows[1]["name"] == "Bob"
        assert rows[1]["age"] is None
        assert rows[1]["city"] == "NYC"

    def test_fillna_subset_unicode_and_special_characters(self, spark):
        """A fill value containing emoji and CJK text round-trips unchanged."""
        schema = StructType(
            [
                StructField("name", StringType()),
                StructField("comment", StringType()),
            ]
        )
        data = [
            {"name": "Alice", "comment": None},
            {"name": "Bob", "comment": None},
        ]
        df = spark.createDataFrame(data, schema)
        result = df.fillna("🚀 Unicode: 测试 🎉", subset=["comment"])
        rows = result.collect()
        assert rows[0]["name"] == "Alice"
        assert rows[0]["comment"] == "🚀 Unicode: 测试 🎉"
        assert rows[1]["name"] == "Bob"
        assert rows[1]["comment"] == "🚀 Unicode: 测试 🎉"

    def test_fillna_subset_large_dataset(self, spark):
        """On 100 rows, every even-indexed null ``value`` becomes -1."""
        data = [{"id": i, "value": None if i % 2 == 0 else i} for i in range(100)]
        df = spark.createDataFrame(data)
        result = df.fillna(-1, subset=["value"])
        rows = result.collect()
        assert len(rows) == 100
        for i, row in enumerate(rows):
            assert row["id"] == i
            if i % 2 == 0:
                # Even ids were created with a null value.
                assert row["value"] == -1
            else:
                assert row["value"] == i

    def test_fillna_subset_single_column_all_rows(self, spark):
        """A fully-null subset column is filled in every row; ``col2`` is kept."""
        schema = StructType(
            [
                StructField("col1", StringType()),
                StructField("col2", StringType()),
            ]
        )
        data = [
            {"col1": None, "col2": "A"},
            {"col1": None, "col2": "B"},
            {"col1": None, "col2": "C"},
        ]
        df = spark.createDataFrame(data, schema)
        result = df.fillna("FILLED", subset=["col1"])
        rows = result.collect()
        for row in rows:
            assert row["col1"] == "FILLED"
        assert rows[0]["col2"] == "A"
        assert rows[1]["col2"] == "B"
        assert rows[2]["col2"] == "C"

    def test_fillna_subset_zero_value(self, spark):
        """Zero is accepted as a fill value for the subset column."""
        data = [
            {"id": 1, "count": None, "total": 100},
            {"id": 2, "count": 5, "total": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna(0, subset=["count"])
        rows = result.collect()
        assert rows[0]["id"] == 1
        assert rows[0]["count"] == 0
        assert rows[0]["total"] == 100
        assert rows[1]["id"] == 2
        assert rows[1]["count"] == 5
        assert rows[1]["total"] is None

    def test_fillna_subset_negative_value(self, spark):
        """A negative number is accepted as a fill value."""
        data = [
            {"id": 1, "balance": None, "debt": 100},
            {"id": 2, "balance": 50, "debt": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna(-999, subset=["balance"])
        rows = result.collect()
        assert rows[0]["id"] == 1
        assert rows[0]["balance"] == -999
        assert rows[0]["debt"] == 100
        assert rows[1]["id"] == 2
        assert rows[1]["balance"] == 50
        assert rows[1]["debt"] is None

    def test_fillna_subset_empty_string(self, spark):
        """An empty string is accepted as a fill value."""
        schema = StructType(
            [
                StructField("name", StringType()),
                StructField("email", StringType()),
            ]
        )
        data = [
            {"name": "Alice", "email": None},
            {"name": "Bob", "email": None},
        ]
        df = spark.createDataFrame(data, schema)
        result = df.fillna("", subset=["email"])
        rows = result.collect()
        assert rows[0]["name"] == "Alice"
        assert rows[0]["email"] == ""
        assert rows[1]["name"] == "Bob"
        assert rows[1]["email"] == ""

    def test_fillna_subset_whitespace_string(self, spark):
        """A whitespace-only string is accepted as a fill value."""
        schema = StructType(
            [
                StructField("name", StringType()),
                StructField("notes", StringType()),
            ]
        )
        data = [
            {"name": "Alice", "notes": None},
            {"name": "Bob", "notes": None},
        ]
        df = spark.createDataFrame(data, schema)
        result = df.fillna(" ", subset=["notes"])
        rows = result.collect()
        assert rows[0]["name"] == "Alice"
        assert rows[0]["notes"] == " "
        assert rows[1]["name"] == "Bob"
        assert rows[1]["notes"] == " "

    def test_fillna_subset_very_long_string(self, spark):
        """A 1000-character fill value is stored in full."""
        schema = StructType(
            [
                StructField("id", IntegerType()),
                StructField("description", StringType()),
            ]
        )
        long_string = "X" * 1000
        data = [
            {"id": 1, "description": None},
            {"id": 2, "description": None},
        ]
        df = spark.createDataFrame(data, schema)
        result = df.fillna(long_string, subset=["description"])
        rows = result.collect()
        assert rows[0]["id"] == 1
        assert rows[0]["description"] == long_string
        assert rows[1]["id"] == 2
        assert rows[1]["description"] == long_string

    def test_fillna_subset_partial_column_fill(self, spark):
        """Only null ``status`` entries are filled; ``priority`` is untouched."""
        data = [
            {"id": 1, "status": "active", "priority": None},
            {"id": 2, "status": None, "priority": "high"},
            {"id": 3, "status": "inactive", "priority": None},
            {"id": 4, "status": None, "priority": "low"},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("default", subset=["status"])
        rows = result.collect()
        assert rows[0]["status"] == "active"
        assert rows[0]["priority"] is None
        assert rows[1]["status"] == "default"
        assert rows[1]["priority"] == "high"
        assert rows[2]["status"] == "inactive"
        assert rows[2]["priority"] is None
        assert rows[3]["status"] == "default"
        assert rows[3]["priority"] == "low"

    def test_fillna_subset_with_filter_operation(self, spark):
        """``fillna`` followed by ``filter`` keeps the filled values."""
        data = [
            {"id": 1, "name": None, "score": 85},
            {"id": 2, "name": "Bob", "score": 90},
            {"id": 3, "name": None, "score": 75},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("UNKNOWN", subset=["name"]).filter(F.col("score") > 80)
        rows = result.collect()
        # The row with score 75 is removed by the filter.
        assert len(rows) == 2
        assert rows[0]["id"] == 1
        assert rows[0]["name"] == "UNKNOWN"
        assert rows[0]["score"] == 85
        assert rows[1]["id"] == 2
        assert rows[1]["name"] == "Bob"
        assert rows[1]["score"] == 90

    def test_fillna_subset_with_select_operation(self, spark):
        """``fillna`` works on the result of a ``select`` projection."""
        schema = StructType(
            [
                StructField("col1", StringType()),
                StructField("col2", StringType()),
                StructField("col3", StringType()),
            ]
        )
        data = [
            {"col1": None, "col2": "A", "col3": None},
            {"col1": "B", "col2": "C", "col3": None},
        ]
        df = spark.createDataFrame(data, schema)
        result = df.select("col1", "col2").fillna("FILLED", subset=["col1"])
        rows = result.collect()
        assert rows[0]["col1"] == "FILLED"
        assert rows[0]["col2"] == "A"
        assert rows[1]["col1"] == "B"
        assert rows[1]["col2"] == "C"

    def test_fillna_subset_preserves_data_types(self, spark):
        """Returned values keep their Python types after a subset fill."""
        data = [
            {"id": 1, "name": None, "age": 25, "active": True},
            {"id": 2, "name": "Bob", "age": None, "active": False},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("UNKNOWN", subset=["name"])
        rows = result.collect()
        assert isinstance(rows[0]["id"], (int, type(None)))
        assert isinstance(rows[0]["name"], str)
        assert isinstance(rows[0]["age"], (int, type(None)))
        assert isinstance(rows[0]["active"], bool)
        assert isinstance(rows[1]["id"], (int, type(None)))
        assert isinstance(rows[1]["name"], str)
        assert isinstance(rows[1]["age"], (int, type(None)))
        assert isinstance(rows[1]["active"], bool)

    def test_fillna_subset_type_mismatch_int_column_string_fill(self, spark):
        """A string fill value leaves nulls in a column of ints untouched."""
        data = [
            {"id": 1, "value": None},
            {"id": 2, "value": 5},
            {"id": 3, "value": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("", subset=["value"])
        rows = result.collect()
        assert rows[0]["value"] is None
        assert rows[1]["value"] == 5
        assert rows[2]["value"] is None

    def test_fillna_subset_type_mismatch_string_column_int_fill(self, spark):
        """An int fill value leaves nulls in a column of strings untouched."""
        data = [
            {"id": 1, "name": None},
            {"id": 2, "name": "Bob"},
            {"id": 3, "name": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna(999, subset=["name"])
        rows = result.collect()
        assert rows[0]["name"] is None
        assert rows[1]["name"] == "Bob"
        assert rows[2]["name"] is None

    def test_fillna_subset_type_mismatch_float_column_string_fill(self, spark):
        """A string fill value leaves nulls in a column of floats untouched."""
        data = [
            {"id": 1, "price": None},
            {"id": 2, "price": 99.99},
            {"id": 3, "price": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("FREE", subset=["price"])
        rows = result.collect()
        assert rows[0]["price"] is None
        assert rows[1]["price"] == 99.99
        assert rows[2]["price"] is None

    def test_fillna_subset_type_mismatch_boolean_column_string_fill(self, spark):
        """A string fill value leaves nulls in a column of booleans untouched."""
        data = [
            {"id": 1, "active": None},
            {"id": 2, "active": True},
            {"id": 3, "active": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("YES", subset=["active"])
        rows = result.collect()
        assert rows[0]["active"] is None
        assert rows[1]["active"] is True
        assert rows[2]["active"] is None

    def test_fillna_subset_type_compatible_string_column_string_fill(self, spark):
        """A string fill value fills nulls in a column of strings."""
        data = [
            {"id": 1, "name": None},
            {"id": 2, "name": "Bob"},
            {"id": 3, "name": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna("UNKNOWN", subset=["name"])
        rows = result.collect()
        assert rows[0]["name"] == "UNKNOWN"
        assert rows[1]["name"] == "Bob"
        assert rows[2]["name"] == "UNKNOWN"

    def test_fillna_subset_type_compatible_int_column_int_fill(self, spark):
        """An int fill value fills nulls in a column of ints."""
        data = [
            {"id": 1, "value": None},
            {"id": 2, "value": 5},
            {"id": 3, "value": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna(0, subset=["value"])
        rows = result.collect()
        assert rows[0]["value"] == 0
        assert rows[1]["value"] == 5
        assert rows[2]["value"] == 0

    def test_fillna_subset_type_compatible_float_column_float_fill(self, spark):
        """A float fill value fills nulls in a column of floats."""
        data = [
            {"id": 1, "price": None},
            {"id": 2, "price": 99.99},
            {"id": 3, "price": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna(0.0, subset=["price"])
        rows = result.collect()
        assert rows[0]["price"] == 0.0
        assert rows[1]["price"] == 99.99
        assert rows[2]["price"] == 0.0

    def test_fillna_subset_type_compatible_float_column_int_fill(self, spark):
        """An int fill value fills nulls in a column of floats."""
        data = [
            {"id": 1, "price": None},
            {"id": 2, "price": 99.99},
            {"id": 3, "price": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna(0, subset=["price"])
        rows = result.collect()
        assert rows[0]["price"] == 0
        assert rows[1]["price"] == 99.99
        assert rows[2]["price"] == 0

    def test_fillna_subset_type_mismatch_dict_value(self, spark):
        """In a dict fill, the matching entry fills; the mismatched one does not."""
        data = [
            {"id": 1, "name": None, "value": None},
            {"id": 2, "name": "Bob", "value": 5},
            {"id": 3, "name": None, "value": None},
        ]
        df = spark.createDataFrame(data)
        result = df.fillna({"name": "UNKNOWN", "value": "INVALID"}, subset=["name"])
        rows = result.collect()
        assert rows[0]["name"] == "UNKNOWN"
        assert rows[1]["name"] == "Bob"
        assert rows[2]["name"] == "UNKNOWN"
        # "INVALID" is a string, so the int-valued column keeps its nulls.
        assert rows[0]["value"] is None
        assert rows[1]["value"] == 5
        assert rows[2]["value"] is None