use polars::prelude::*;
/// Replaces values that match one of the given markers with null.
///
/// The markers are packed into a single-row list series and handed to
/// [`build_null_expression`], which compares each (stripped) cell against them.
/// Because the cells are stripped before the comparison, markers should be
/// supplied without surrounding whitespace.
///
/// # Arguments
///
/// * `dataframe` - Frame to clean; it is consumed and a new frame is returned.
/// * `null_value_list` - Marker strings that should become null.
/// * `apply_to_all_columns` - `true` to inspect every column (via its string
///   representation), `false` to inspect only `String` columns.
///
/// # Errors
///
/// Propagates any error produced while collecting the lazy query.
pub fn replace_values_with_null(
    dataframe: DataFrame,
    null_value_list: &[&str],
    apply_to_all_columns: bool,
) -> PolarsResult<DataFrame> {
    // No markers were supplied: hand the frame back untouched.
    if null_value_list.is_empty() {
        return Ok(dataframe);
    }

    let markers = Series::build_from_list(null_value_list).lit();
    let nullify = build_null_expression(markers, apply_to_all_columns);

    dataframe.lazy().with_columns([nullify]).collect()
}
/// Extension trait that adds a list-building constructor to `Series`.
pub trait SeriesExtension {
/// Builds a series from a slice of string markers.
///
/// The implementation for `Series` in this file wraps the whole slice into a
/// single list element, producing a one-row `List<String>` series.
fn build_from_list(input_slice: &[&str]) -> Series;
}
impl SeriesExtension for Series {
    /// Wraps `input_slice` into a one-row list series.
    ///
    /// The strings first become a flat series named `"inner_content"`; that
    /// series is then stored as the only element of a list series named
    /// `"list_string"`.
    fn build_from_list(input_slice: &[&str]) -> Series {
        let values = Series::new("inner_content".into(), input_slice);
        Series::new("list_string".into(), vec![values])
    }
}
/// Builds the expression that turns marker values into nulls.
///
/// Each inspected value is stripped with `strip_chars(lit(NULL))` (per the
/// polars documentation a null pattern strips leading and trailing whitespace)
/// and then checked for membership in `null_values_expr`. Matching cells become
/// null, every other cell keeps its original value, and `.name().keep()`
/// preserves the original column names.
///
/// # Arguments
///
/// * `null_values_expr` - Expression holding the list of marker strings.
/// * `apply_to_all_columns` - When `true`, every column is cast to `String`
///   for the comparison only; the untouched original columns are used as the
///   fallback. When `false`, only `String` columns are selected and the null
///   literal is cast to `String`.
pub fn build_null_expression(null_values_expr: Expr, apply_to_all_columns: bool) -> Expr {
    // Shared test: "does the stripped value appear in the marker list?"
    // The closure consumes `null_values_expr`, so it is called exactly once.
    let matches_marker = move |values: Expr| -> Expr {
        values
            .str()
            .strip_chars(lit(NULL))
            .is_in(null_values_expr, true)
    };

    if apply_to_all_columns {
        // Compare on the string form of every column, whatever its dtype.
        let is_marker = matches_marker(all().as_expr().cast(DataType::String));
        when(is_marker)
            .then(lit(NULL))
            .otherwise(all())
            .name()
            .keep()
    } else {
        // Restrict both the comparison and the replacement to String columns.
        let string_columns = dtype_col(&DataType::String).as_selector().as_expr();
        let is_marker = matches_marker(string_columns.clone());
        when(is_marker)
            .then(lit(NULL).cast(DataType::String))
            .otherwise(string_columns)
            .name()
            .keep()
    }
}
#[cfg(test)]
mod tests_replace_values_with_null {
use super::*; use polars::functions::concat_df_horizontal;
/// Builds the ten-row fixture shared by several tests: two string columns
/// (one with whitespace-padded values) plus integer, float and boolean columns.
fn create_test_df() -> PolarsResult<DataFrame> {
    df!(
        // Regular values mixed with padded and unpadded marker spellings.
        "col_str" => &[Some("Keep"), Some(" N/A "), Some("<N/D>"), Some(" "), Some("Value"), None, Some("NA"), Some("999"), Some("3.45"), Some("false")],
        // Contains 999 twice, which matches a marker once rendered as a string.
        "col_int" => &[Some(1), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(999), Some(0), Some(10)],
        "col_flt" => &[Some(1.1), Some(2.2), Some(999.0), Some(999.1), Some(5.5), Some(6.6), Some(7.7), Some(8.8), Some(3.45), Some(10.1)],
        "col_bool" => &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(true), Some(false), Some(true), Some(false), Some(true)],
        // Leading/trailing whitespace variants, an empty string and a null.
        "col_str_ws" => &[Some(" leading"), Some("trailing "), Some(" both "), Some(""), Some("NA"), Some(" NA "), None, Some("ok"), Some("999 "), Some(" 3.45")]
    )
}
/// Marker strings used by the tests that run against `create_test_df`;
/// each one is passed to `replace_values_with_null` as a null placeholder.
const NULL_MARKERS: &[&str] = &["", "<N/D>", "NA", "N/A", "999", "3.45", "false"];
/// All-columns mode: markers are matched against the string form of string,
/// integer, float and boolean columns, and the column dtypes must survive.
#[test]
fn test_universal_replacement_mixed_types() -> Result<(), PolarsError> {
    let null_markers = &["", "NA", "<N/D>", "999", "3.45", "false", "2024-01-15"];

    let df_input = df![
        "col_str" => &[Some("Keep"), Some(" NA "), Some("<N/D>"), Some(" "), None, Some("999"), Some("3.45"), Some("false"), Some("2024-01-15")],
        "col_int" => &[Some(123i32), Some(999i32), Some(-10i32), Some(999i32), Some(200i32), Some(0i32), Some(999i32), Some(1i32), Some(2i32)],
        "col_float" => &[Some(1.1), Some(3.45), Some(-2.2), None, Some(999.0), Some(0.0), Some(123.456), Some(3.450), Some(5.0)],
        "col_bool" => &[Some(true), Some(false), None, Some(true), Some(false), Some(true), Some(true), Some(false), Some(true)],
    ]?;

    // Note that the float 999.0 is expected to survive: only 3.45 / 3.450 are
    // expected to be nullified in `col_float`.
    let df_expected = df![
        "col_str" => &[Some("Keep"), None, None, None, None, None, None, None, None],
        "col_int" => &[Some(123i32), None, Some(-10i32), None, Some(200i32), Some(0i32), None, Some(1i32), Some(2i32)],
        "col_float" => &[Some(1.1), None, Some(-2.2), None, Some(999.0), Some(0.0), Some(123.456), None, Some(5.0)],
        "col_bool" => &[Some(true), None, None, Some(true), None, Some(true), Some(true), None, Some(true)],
    ]?;

    println!("Input:\n{df_input}");
    println!("Null Markers: {null_markers:?}");

    let df_output = replace_values_with_null(df_input, null_markers, true)?;

    println!("Output:\n{df_output}");
    println!("Expected:\n{df_expected}");

    // Dtypes first, then the values (nulls compare equal to nulls).
    assert_eq!(
        df_output.schema(),
        df_expected.schema(),
        "Schemas do not match"
    );
    assert!(
        df_output.equals_missing(&df_expected),
        "DataFrames did not match for universal mixed type test."
    );

    Ok(())
}
/// String-only mode: markers are replaced in `String` columns while the
/// integer and boolean columns are left exactly as they were (including the
/// integer 999, which would match a marker in all-columns mode).
#[test]
fn test_replace_values_with_null_string_only() -> PolarsResult<()> {
    let df_input = df![
        "col1" => &[" A ", "B ", " C", "D", "", " ", "NA", "KEEP", " null "],
        "col2" => &[Some(1i32), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(8), Some(9)],
        "col3" => &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true)],
        "col4" => &[" NA ", "ignore", "N/A", "None", "999", "", "KEEP", "other", "false"]
    ]?;

    let null_list = &["NA", "", "999", "N/A", "null"];

    // Non-matching strings keep their original padding; only matches become null.
    let df_expected = df![
        "col1" => &[Some(" A "), Some("B "), Some(" C"), Some("D"), None, None, None, Some("KEEP"), None],
        "col2" => &[Some(1i32), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(8), Some(9)],
        "col3" => &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true)],
        "col4" => &[None, Some("ignore"), None, Some("None"), None, None, Some("KEEP"), Some("other"), Some("false")]
    ]?;

    println!("Input:\n{df_input}");
    println!("Null List: {null_list:?}");

    let df_output = replace_values_with_null(df_input, null_list, false)?;

    println!("Output:\n{df_output}");
    println!("Expected:\n{df_expected}");

    assert_eq!(
        df_output.schema(),
        df_expected.schema(),
        "Schemas do not match"
    );
    // The failure message names this test (it previously carried the text of
    // the universal mixed-type test, which made failures misleading).
    assert!(
        df_output.equals_missing(&df_expected),
        "DataFrames did not match for string-only replacement test."
    );

    Ok(())
}
/// Whitespace-only values must become null only when the empty string is one
/// of the markers; padded marker values (" Val ") are nullified either way.
#[test]
fn test_universal_whitespace_handling() -> Result<(), PolarsError> {
    let df_input = df!(
        "col_a" => &[Some(" "), Some("\t\n"), Some("Keep"), Some(" Val "), None, Some("")],
    )?;

    // Case 1: "" is a marker, so blank and whitespace-only cells are nullified.
    let null_markers_with_empty = &["", "Val"];
    let df_expected_with_empty = df!(
        "col_a" => &[None::<&str>, None::<&str>, Some("Keep"), None, None, None],
    )?;
    let df_output_with_empty =
        replace_values_with_null(df_input.clone(), null_markers_with_empty, true)?;
    assert!(
        df_output_with_empty.equals_missing(&df_expected_with_empty),
        "Whitespace not nullified when '' IS targeted.\nOutput:\n{df_output_with_empty:?}\nExpected:\n{df_expected_with_empty:?}"
    );

    // Case 2: "" is not a marker, so blank and whitespace-only cells survive.
    let null_markers_without_empty = &["Val"];
    let df_expected_without_empty = df!(
        "col_a" => &[Some(" "), Some("\t\n"), Some("Keep"), None, None, Some("")],
    )?;
    let df_output_without_empty =
        replace_values_with_null(df_input.clone(), null_markers_without_empty, true)?;
    assert!(
        df_output_without_empty.equals_missing(&df_expected_without_empty),
        "Whitespace incorrectly nullified when '' NOT targeted.\nOutput:\n{df_output_without_empty:?}\nExpected:\n{df_expected_without_empty:?}"
    );

    Ok(())
}
/// String-only mode on the shared fixture: both string columns are cleaned,
/// the numeric and boolean columns are expected to come back unchanged.
#[test]
fn test_string_columns_only() -> Result<(), PolarsError> {
    let df_input = create_test_df()?;

    let df_expected = df!(
        "col_str" => &[Some("Keep"), None, None, None, Some("Value"), None, None, None, None, None],
        // Non-string columns: identical to the fixture.
        "col_int" => &[Some(1), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(999), Some(0), Some(10)],
        "col_flt" => &[Some(1.1), Some(2.2), Some(999.0), Some(999.1), Some(5.5), Some(6.6), Some(7.7), Some(8.8), Some(3.45), Some(10.1)],
        "col_bool" => &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(true), Some(false), Some(true), Some(false), Some(true)],
        "col_str_ws" => &[Some(" leading"), Some("trailing "), Some(" both "), None, None, None, None, Some("ok"), None, None]
    )?;

    println!("Input:\n{df_input}");
    println!("Null Markers: {NULL_MARKERS:?}");

    let df_output = replace_values_with_null(df_input, NULL_MARKERS, false)?;

    println!("Output:\n{df_output}");
    println!("Expected:\n{df_expected}");

    assert_eq!(df_output, df_expected);

    Ok(())
}
/// All-columns mode on the shared fixture: markers are also expected to be
/// removed from the integer, float and boolean columns.
#[test]
fn test_all_columns() -> Result<(), PolarsError> {
    let df_input = create_test_df()?;

    let df_expected = df!(
        "col_str" => &[Some("Keep"), None, None, None, Some("Value"), None, None, None, None, None],
        // 999 is nullified in the integer column ...
        "col_int" => &[Some(1), Some(2), None, Some(4), Some(5), Some(6), Some(7), None, Some(0), Some(10)],
        // ... while in the float column only 3.45 is expected to disappear.
        "col_flt" => &[Some(1.1), Some(2.2), Some(999.0), Some(999.1), Some(5.5), Some(6.6), Some(7.7), Some(8.8), None, Some(10.1)],
        // Every `false` is expected to be nullified.
        "col_bool" => &[Some(true), None, Some(true), None, Some(true), Some(true), None, Some(true), None, Some(true)],
        "col_str_ws" => &[Some(" leading"), Some("trailing "), Some(" both "), None, None, None, None, Some("ok"), None, None]
    )?;

    println!("Input:\n{df_input}");
    println!("Null Markers: {NULL_MARKERS:?}");

    let df_output = replace_values_with_null(df_input, NULL_MARKERS, true)?;

    println!("Output:\n{df_output}");
    println!("Expected:\n{df_expected}");

    assert_eq!(df_output, df_expected);

    Ok(())
}
/// An empty marker list must leave the frame untouched in both modes.
#[test]
fn test_empty_null_list() -> Result<(), PolarsError> {
    let df_orig = create_test_df()?;
    println!("df_orig: {df_orig:?}");

    // `false` = string columns only, `true` = all columns.
    for apply_to_all_columns in [false, true] {
        let result = replace_values_with_null(df_orig.clone(), &[], apply_to_all_columns)?;
        assert_eq!(result, df_orig);
    }

    Ok(())
}
/// Markers that never occur in the fixture must leave it untouched in both modes.
#[test]
fn test_no_matches_in_list() -> Result<(), PolarsError> {
    let df_orig = create_test_df()?;
    println!("df_orig: {df_orig:?}");

    let no_match_markers = &["XYZ", "12345", "NO_MATCH"];

    // `false` = string columns only, `true` = all columns.
    for apply_to_all_columns in [false, true] {
        let result =
            replace_values_with_null(df_orig.clone(), no_match_markers, apply_to_all_columns)?;
        assert_eq!(result, df_orig);
    }

    Ok(())
}
/// A frame that already contains only nulls must come back unchanged in both modes.
#[test]
fn test_all_nulls_input() -> Result<(), PolarsError> {
    let df = df!(
        "a" => &[Option::<i32>::None, None],
        "b" => &[Option::<&str>::None, None]
    )?;
    println!("df: {df:?}");

    // `false` = string columns only, `true` = all columns.
    for apply_to_all_columns in [false, true] {
        let result = replace_values_with_null(df.clone(), NULL_MARKERS, apply_to_all_columns)?;
        assert_eq!(result, df);
    }

    Ok(())
}
/// Checks the stripping condition and the replacement expression side by side:
/// the output frame holds the original column, the nullified column and the
/// boolean `is_in` condition that drove the replacement.
#[test]
fn test_remove_leading_and_trailing_chars() -> Result<(), PolarsError> {
    pub static NULL_VALUES_TEST: [&str; 3] = ["", "<N/D>", "*DIVERSOS*"];

    let df_input = df! {
        "foo" => &["", " ", "hello ", " <N/D> ", " *DIVERSOS* \n ", " world", " \n\r *DIVERSOS* \n ", "<N/D>"],
    }?;
    println!("df_input: {df_input}");

    // Pack the markers into a single list value so `is_in` can test against it.
    let series = Series::new("null_vals".into(), NULL_VALUES_TEST);
    let null_values_expr: Expr = series.implode()?.into_series().lit();

    // Same condition that `build_null_expression` builds for all-columns mode,
    // materialised here as its own column for inspection.
    let condition = all()
        .as_expr()
        .cast(DataType::String)
        .str()
        .strip_chars(lit(NULL))
        .is_in(null_values_expr.clone(), true);
    println!("condition: {condition}");

    let replacement_expr: Expr = build_null_expression(null_values_expr, true);
    println!("replacement_expr: {replacement_expr}");

    // The replacement keeps the name "foo" (overwriting it); the aliased
    // condition is appended. Both are renamed before the horizontal concat.
    let mut df_temp = df_input
        .clone()
        .lazy()
        .with_columns([condition.alias("other name"), replacement_expr])
        .collect()?;
    df_temp.set_column_names(&["foo_stripped", "is_in condition"])?;

    let df_output = concat_df_horizontal(&[df_input, df_temp], true, true, true)?;
    println!("df_output: {df_output}");

    // Debug views of the nullified column: first with a "null" placeholder,
    // then as raw options.
    let stripped = df_output.column("foo_stripped")?.str()?;

    let vec_from_series: Vec<&str> = stripped
        .iter()
        .map(|opt_str| opt_str.unwrap_or("null"))
        .collect();
    println!("vec_from_series: {vec_from_series:?}");

    let vec_from_series: Vec<Option<&str>> = stripped.iter().collect();
    println!("vec_from_series: {vec_from_series:?}");

    let df_expected = df! {
        "foo" => &["", " ", "hello ", " <N/D> ", " *DIVERSOS* \n ", " world", " \n\r *DIVERSOS* \n ", "<N/D>"],
        "foo_stripped" => &[None, None, Some("hello "), None, None, Some(" world"), None, None],
        "is_in condition" => &[true, true, false, true, true, false, true, true],
    }?;

    assert_eq!(
        df_output, df_expected,
        "DataFrame mismatch after schema modify and null handling"
    );
    assert_eq!(
        df_output.schema(),
        df_expected.schema(),
        "DataFrame mismatch schema"
    );

    Ok(())
}
/// `Series::build_from_list` must produce the same one-row `List<String>`
/// series as building it by hand from an inner string series.
#[test]
fn test_create_list_series_direct_from_series() -> Result<(), PolarsError> {
    let null_value_list: &[&str] = &["", "<N/D>", "SIM"];

    // Manual construction: flat string series first ...
    let inner_series = Series::new("inner_content".into(), null_value_list);
    println!("inner_series: {inner_series}");
    assert_eq!(inner_series.len(), 3, "The length of the Series must be 3");

    // ... then wrapped as the single element of a list series.
    let list_series: Series = Series::new("list_string".into(), vec![inner_series]);
    println!("list_series: {list_series}");
    assert_eq!(list_series.len(), 1, "The length of the Series must be 1");
    assert_eq!(
        list_series.dtype(),
        &DataType::List(Box::new(DataType::String)),
        "Resulting series should be List<String>"
    );

    // The helper must agree with the manual construction in every respect.
    let result_series: Series = Series::build_from_list(null_value_list);
    assert_eq!(result_series.len(), 1);
    assert_eq!(result_series.name(), "list_string");
    assert_eq!(
        result_series.dtype(),
        &DataType::List(Box::new(DataType::String))
    );
    assert_eq!(result_series, list_series);

    Ok(())
}
}