1use polars::prelude::*;
2
3pub fn replace_values_with_null(
38 dataframe: DataFrame,
39 null_value_list: &[&str],
40 apply_to_all_columns: bool,
41) -> PolarsResult<DataFrame> {
42 if null_value_list.is_empty() {
44 return Ok(dataframe);
45 }
46
47 let list_series: Series = Series::build_from_list(null_value_list);
50
51 let replacement_expr: Expr = build_null_expression(list_series.lit(), apply_to_all_columns);
54
55 dataframe
58 .lazy()
59 .with_columns([replacement_expr]) .collect() }
62
63pub trait SeriesExtension {
65 fn build_from_list(input_slice: &[&str]) -> Series;
72}
73
74impl SeriesExtension for Series {
75 fn build_from_list(input_slice: &[&str]) -> Series {
76 let inner_series = Series::new("inner_content".into(), input_slice);
79
80 let list_series: Series = Series::new(
85 "list_string".into(), vec![inner_series], );
88
89 list_series
90 }
91}
92
93pub fn build_null_expression(null_values_expr: Expr, apply_to_all_columns: bool) -> Expr {
100 let replacement_expr: Expr = if apply_to_all_columns {
102 let condition = all() .as_expr()
105 .cast(DataType::String) .str()
107 .strip_chars(lit(NULL)) .is_in(null_values_expr, true); when(condition) .then(lit(NULL)) .otherwise(all()) .name()
114 .keep() } else {
116 let string_cols_selector = dtype_col(&DataType::String).as_selector().as_expr();
118
119 let condition = string_cols_selector .clone() .str()
122 .strip_chars(lit(NULL)) .is_in(null_values_expr, true); when(condition) .then(lit(NULL).cast(DataType::String))
128 .otherwise(string_cols_selector)
130 .name()
131 .keep() };
133
134 replacement_expr
135}
136
137#[cfg(test)]
144mod tests_replace_values_with_null {
145 use super::*; use polars::functions::concat_df_horizontal;
147
148 fn create_test_df() -> PolarsResult<DataFrame> {
150 df!(
151 "col_str" => &[
153 Some("Keep"), Some(" N/A "), Some("<N/D>"), Some(" "), Some("Value"),
154 None, Some("NA"), Some("999"), Some("3.45"), Some("false")
156 ],
157 "col_int" => &[
159 Some(1), Some(2), Some(999), Some(4), Some(5),
160 Some(6), Some(7), Some(999), Some(0), Some(10)
161 ],
162 "col_flt" => &[
164 Some(1.1), Some(2.2), Some(999.0), Some(999.1), Some(5.5),
165 Some(6.6), Some(7.7), Some(8.8), Some(3.45), Some(10.1)
166 ],
167 "col_bool" => &[
169 Some(true), Some(false), Some(true), Some(false), Some(true),
170 Some(true), Some(false), Some(true), Some(false), Some(true)
171 ],
172 "col_str_ws" => &[
174 Some(" leading"), Some("trailing "), Some(" both "), Some(""), Some("NA"),
175 Some(" NA "),
176 None, Some("ok"), Some("999 "), Some(" 3.45")
178 ]
179 )
180 }
181
182 const NULL_MARKERS: &[&str] = &["", "<N/D>", "NA", "N/A", "999", "3.45", "false"];
184
185 #[test]
188 fn test_universal_replacement_mixed_types() -> Result<(), PolarsError> {
189 let df_input = df![
191 "col_str" => &[Some("Keep"), Some(" NA "), Some("<N/D>"), Some(" "), None, Some("999"), Some("3.45"), Some("false"), Some("2024-01-15")],
192 "col_int" => &[Some(123i32), Some(999i32), Some(-10i32), Some(999i32), Some(200i32), Some(0i32), Some(999i32), Some(1i32), Some(2i32)], "col_float" => &[Some(1.1), Some(3.45), Some(-2.2), None, Some(999.0), Some(0.0), Some(123.456), Some(3.450), Some(5.0)], "col_bool" => &[Some(true), Some(false), None, Some(true), Some(false), Some(true), Some(true), Some(false), Some(true)], ]?;
196
197 let null_markers = &[
199 "", "NA", "<N/D>", "999", "3.45", "false", "2024-01-15",
206 ];
207
208 let df_expected = df![
210 "col_str" => &[Some("Keep"), None, None, None, None, None, None, None, None],
211 "col_int" => &[Some(123i32), None, Some(-10i32), None, Some(200i32), Some(0i32), None, Some(1i32), Some(2i32)], "col_float" => &[Some(1.1), None, Some(-2.2), None, Some(999.0), Some(0.0), Some(123.456), None, Some(5.0)],
213 "col_bool" => &[Some(true), None, None, Some(true), None, Some(true), Some(true), None, Some(true)],
214 ]?;
215
216 println!("Input:\n{df_input}");
217 println!("Null Markers: {null_markers:?}");
218 let df_output = replace_values_with_null(df_input, null_markers, true)?;
219 println!("Output:\n{df_output}");
220 println!("Expected:\n{df_expected}");
221
222 assert_eq!(
224 df_output.schema(),
225 df_expected.schema(),
226 "Schemas do not match"
227 );
228 assert!(
229 df_output.equals_missing(&df_expected),
230 "DataFrames did not match for universal mixed type test."
231 );
232
233 Ok(())
234 }
235
236 #[test]
238 fn test_replace_values_with_null_string_only() -> PolarsResult<()> {
239 let df_input = df![
240 "col1" => &[" A ", "B ", " C", "D", "", " ", "NA", "KEEP", " null "],
241 "col2" => &[Some(1i32), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(8), Some(9)],
242 "col3" => &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true)],
243 "col4" => &[" NA ", "ignore", "N/A", "None", "999", "", "KEEP", "other", "false"]
244 ]?;
245
246 let null_list = &["NA", "", "999", "N/A", "null"]; let df_expected = df![
249 "col1" => &[Some(" A "), Some("B "), Some(" C"), Some("D"), None, None, None, Some("KEEP"), None],
250 "col2" => &[Some(1i32), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(8), Some(9)],
251 "col3" => &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true)],
252 "col4" => &[None, Some("ignore"), None, Some("None"), None, None, Some("KEEP"), Some("other"), Some("false")]
253 ]?;
254
255 println!("Input:\n{df_input}");
256 println!("Null List: {null_list:?}");
257 let df_output = replace_values_with_null(df_input, null_list, false)?; println!("Output:\n{df_output}");
259 println!("Expected:\n{df_expected}");
260
261 assert_eq!(
263 df_output.schema(),
264 df_expected.schema(),
265 "Schemas do not match"
266 );
267 assert!(
268 df_output.equals_missing(&df_expected),
269 "DataFrames did not match for universal mixed type test."
270 );
271
272 Ok(())
273 }
274
275 #[test]
278 fn test_universal_whitespace_handling() -> Result<(), PolarsError> {
279 let df_input = df!(
280 "col_a" => &[Some(" "), Some("\t\n"), Some("Keep"), Some(" Val "), None, Some("")],
281 )?;
282
283 let null_markers_with_empty = &["", "Val"]; let df_expected_with_empty = df!(
286 "col_a" => &[None::<&str>, None::<&str>, Some("Keep"), None, None, None],
287 )?;
288 let df_output_with_empty = replace_values_with_null(
289 df_input.clone(), null_markers_with_empty,
291 true,
292 )?;
293 assert!(
294 df_output_with_empty.equals_missing(&df_expected_with_empty),
295 "Whitespace not nullified when '' IS targeted.\nOutput:\n{df_output_with_empty:?}\nExpected:\n{df_expected_with_empty:?}"
296 );
297
298 let null_markers_without_empty = &["Val"]; let df_expected_without_empty = df!(
301 "col_a" => &[Some(" "), Some("\t\n"), Some("Keep"), None, None, Some("")],
303 )?;
304 let df_output_without_empty = replace_values_with_null(
305 df_input.clone(), null_markers_without_empty,
307 true,
308 )?;
309 assert!(
310 df_output_without_empty.equals_missing(&df_expected_without_empty),
311 "Whitespace incorrectly nullified when '' NOT targeted.\nOutput:\n{df_output_without_empty:?}\nExpected:\n{df_expected_without_empty:?}"
312 );
313
314 Ok(())
315 }
316
317 #[test]
319 fn test_string_columns_only() -> Result<(), PolarsError> {
320 let df_input = create_test_df()?;
321
322 let df_expected = df!(
324 "col_str" => &[Some("Keep"), None, None, None, Some("Value"), None, None, None, None, None],
325 "col_int" => &[Some(1), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(999), Some(0), Some(10)],
326 "col_flt" => &[Some(1.1), Some(2.2), Some(999.0), Some(999.1), Some(5.5), Some(6.6), Some(7.7), Some(8.8), Some(3.45), Some(10.1)],
327 "col_bool" => &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(true), Some(false), Some(true), Some(false), Some(true)],
328 "col_str_ws" => &[Some(" leading"), Some("trailing "), Some(" both "), None, None, None, None, Some("ok"), None, None]
329 )?;
330
331 println!("Input:\n{df_input}");
332 println!("Null Markers: {NULL_MARKERS:?}");
333 let df_output = replace_values_with_null(df_input, NULL_MARKERS, false)?;
334 println!("Output:\n{df_output}");
335 println!("Expected:\n{df_expected}");
336
337 assert_eq!(df_output, df_expected);
338 Ok(())
339 }
340
341 #[test]
343 fn test_all_columns() -> Result<(), PolarsError> {
344 let df_input = create_test_df()?;
345
346 let df_expected = df!(
347 "col_str" => &[Some("Keep"), None, None, None, Some("Value"), None, None, None, None, None],
348 "col_int" => &[Some(1), Some(2), None, Some(4), Some(5), Some(6), Some(7), None, Some(0), Some(10)],
349 "col_flt" => &[Some(1.1), Some(2.2), Some(999.0), Some(999.1), Some(5.5), Some(6.6), Some(7.7), Some(8.8), None, Some(10.1)],
350 "col_bool" => &[Some(true), None, Some(true), None, Some(true), Some(true), None, Some(true), None, Some(true)],
351 "col_str_ws" => &[Some(" leading"), Some("trailing "), Some(" both "), None, None, None, None, Some("ok"), None, None]
352 )?;
353
354 println!("Input:\n{df_input}");
355 println!("Null Markers: {NULL_MARKERS:?}");
356 let df_output = replace_values_with_null(df_input, NULL_MARKERS, true)?;
357 println!("Output:\n{df_output}");
358 println!("Expected:\n{df_expected}");
359
360 assert_eq!(df_output, df_expected);
361 Ok(())
362 }
363
364 #[test]
365 fn test_empty_null_list() -> Result<(), PolarsError> {
366 let df_orig = create_test_df()?;
367 println!("df_orig: {df_orig:?}");
368
369 let result_str_only = replace_values_with_null(df_orig.clone(), &[], false)?;
370 let result_all_cols = replace_values_with_null(df_orig.clone(), &[], true)?;
371
372 assert_eq!(result_str_only, df_orig);
373 assert_eq!(result_all_cols, df_orig);
374
375 Ok(())
376 }
377
378 #[test]
379 fn test_no_matches_in_list() -> Result<(), PolarsError> {
380 let df_orig = create_test_df()?;
381 println!("df_orig: {df_orig:?}");
382
383 let no_match_markers = &["XYZ", "12345", "NO_MATCH"];
384
385 let result_str_only = replace_values_with_null(df_orig.clone(), no_match_markers, false)?;
386 let result_all_cols = replace_values_with_null(df_orig.clone(), no_match_markers, true)?;
387
388 assert_eq!(result_str_only, df_orig);
389 assert_eq!(result_all_cols, df_orig);
390
391 Ok(())
392 }
393
394 #[test]
395 fn test_all_nulls_input() -> Result<(), PolarsError> {
396 let df = df!(
397 "a" => &[Option::<i32>::None, None], "b" => &[Option::<&str>::None, None] )?;
400 println!("df: {df:?}");
401
402 let result_str_only = replace_values_with_null(df.clone(), NULL_MARKERS, false)?;
403 let result_all_cols = replace_values_with_null(df.clone(), NULL_MARKERS, true)?;
404
405 assert_eq!(result_str_only, df);
406 assert_eq!(result_all_cols, df);
407 Ok(())
408 }
409
410 #[test]
412 fn test_remove_leading_and_trailing_chars() -> Result<(), PolarsError> {
413 pub static NULL_VALUES_TEST: [&str; 3] = [
415 "", "<N/D>", "*DIVERSOS*", ];
419
420 let df_input = df! {
421 "foo" => &["", " ", "hello ", " <N/D> ", " *DIVERSOS* \n ", " world", " \n\r *DIVERSOS* \n ", "<N/D>"],
422 }?;
423
424 println!("df_input: {df_input}");
425
426 let series = Series::new("null_vals".into(), NULL_VALUES_TEST);
428 let null_values_expr: Expr = series.implode()?.into_series().lit();
429
430 let condition = all() .as_expr()
432 .cast(DataType::String) .str()
434 .strip_chars(lit(NULL)) .is_in(null_values_expr.clone(), true); println!("condition: {condition}");
437
438 let replacement_expr: Expr = build_null_expression(null_values_expr, true);
439 println!("replacement_expr: {replacement_expr}");
440
441 let mut df_temp = df_input
442 .clone()
443 .lazy()
444 .with_columns([condition.alias("other name"), replacement_expr]) .collect()?;
446 df_temp.set_column_names(["foo_stripped", "is_in condition"])?;
447
448 let df_output = concat_df_horizontal(&[df_input, df_temp], true)?;
451
452 println!("df_output: {df_output}");
453
454 let vec_from_series: Vec<&str> = df_output
455 .column("foo_stripped")?
456 .str()?
457 .iter() .map(|opt_str| opt_str.unwrap_or("null"))
459 .collect();
460
461 println!("vec_from_series: {vec_from_series:?}");
462
463 let vec_from_series: Vec<Option<&str>> = df_output
464 .column("foo_stripped")?
465 .str()?
466 .iter() .collect();
468
469 println!("vec_from_series: {vec_from_series:?}");
470
471 let df_expected = df! {
472 "foo" => &["", " ", "hello ", " <N/D> ", " *DIVERSOS* \n ", " world", " \n\r *DIVERSOS* \n ", "<N/D>"],
473 "foo_stripped" => &[None, None, Some("hello "), None, None, Some(" world"), None, None],
474 "is_in condition" => &[true, true, false, true, true, false, true, true],
475 }?;
476
477 assert_eq!(
478 df_output, df_expected,
479 "DataFrame mismatch after schema modify and null handling"
480 );
481
482 assert_eq!(
483 df_output.schema(),
484 df_expected.schema(),
485 "DataFrame mismatch schema"
486 );
487
488 Ok(())
489 }
490
491 #[test]
492 fn test_create_list_series_direct_from_series() -> Result<(), PolarsError> {
494 let null_value_list: &[&str] = &["", "<N/D>", "SIM"];
496
497 let inner_series = Series::new("inner_content".into(), null_value_list);
499 println!("inner_series: {inner_series}");
500
501 assert_eq!(inner_series.len(), 3, "The length of the Series must be 3");
502
503 let list_series: Series = Series::new("list_string".into(), vec![inner_series]);
505 println!("list_series: {list_series}");
506
507 assert_eq!(list_series.len(), 1, "The length of the Series must be 1");
508
509 assert_eq!(
510 list_series.dtype(),
511 &DataType::List(Box::new(DataType::String)),
512 "Resulting series should be List<String>"
513 );
514
515 let result_series: Series = Series::build_from_list(null_value_list);
516
517 assert_eq!(result_series.len(), 1); assert_eq!(result_series.name(), "list_string");
519 assert_eq!(
520 result_series.dtype(),
521 &DataType::List(Box::new(DataType::String))
522 );
523 assert_eq!(result_series, list_series);
524
525 Ok(())
526 }
527}