polars_view/polars/
replace.rs

1use polars::prelude::*;
2
3/// Replaces values with null based on a list of matching strings, with options
4/// to apply to all columns or only string columns.
5///
6/// This function compares values against `null_value_list` and replaces them with `NULL`
7/// upon a match. The comparison behavior depends on the `apply_to_all_columns` flag:
8///
9/// 1.  **If `apply_to_all_columns` is `false` (Default String Behavior):**
10///     *   Operates **only** on columns with `DataType::String`.
11///     *   Trims leading/trailing whitespace from the **original string** value.
12///     *   Compares the trimmed string against `null_value_list`.
13///     *   Non-string columns and non-matching strings are untouched.
14///     *   To nullify empty/whitespace-only strings, include `""` in `null_value_list`.
15///
16/// 2.  **If `apply_to_all_columns` is `true` (Universal Behavior):**
17///     *   Operates on **all** columns in the DataFrame.
18///     *   Casts the value in each column to its **string representation** (`DataType::String`).
19///     *   Trims leading/trailing whitespace from this **string representation**.
20///     *   Compares the trimmed string representation against `null_value_list`.
21///     *   If a match occurs, the **original value** (regardless of type) is replaced with `NULL`.
22///
23/// ### Important Considerations (especially when `apply_to_all_columns = true`):
24///
25/// *   **Trimming:** Whitespace is *always* trimmed before comparison in both modes.
26///     For `apply_to_all_columns = true`, trimming occurs *after* casting to string.
27/// *   **Type Casting:** The universal mode relies on Polars' default casting to String.
28///     Ensure strings in `null_value_list` match the *trimmed* string representation
29///     of numbers, booleans, dates, etc. (e.g., "3.45", "true", "2023-01-01", "NA").
30/// *   **Ambiguity:** A string like "123" in the list might match integer `123`,
31///     float `123.0` (if its string form trims to "123"), and string `" 123 "`.
32/// *   **Complex Types:** Casting complex types (List, Struct, Binary) to String might
33///     yield unpredictable representations or errors. Use with caution.
34/// *   **Performance:** The universal mode (casting all values) can be slower than
35///     the string-only mode on large datasets.
36///
37pub fn replace_values_with_null(
38    dataframe: DataFrame,
39    null_value_list: &[&str],
40    apply_to_all_columns: bool,
41) -> PolarsResult<DataFrame> {
42    // If the list is empty, no replacements are needed.
43    if null_value_list.is_empty() {
44        return Ok(dataframe);
45    }
46
47    // --- Prepare for Matching ---
48
49    let list_series: Series = Series::build_from_list(null_value_list);
50
51    // --- Define Replacement Expr ---
52
53    let replacement_expr: Expr = build_null_expression(list_series.lit(), apply_to_all_columns);
54
55    // --- Apply Transformation ---
56
57    dataframe
58        .lazy()
59        .with_columns([replacement_expr]) // Apply the selected expression
60        .collect() // Execute the lazy plan
61}
62
63/// Define a trait to add building capabilities directly to the Series type.
64pub trait SeriesExtension {
65    /// Builds List<String> Series (shape (1,)) with single list row from input_slice.
66    ///
67    /// ### Arguments
68    /// * `input_slice`: Strings for the list content.
69    /// ### Returns
70    /// A new List<String> Series, shape (1,).
71    fn build_from_list(input_slice: &[&str]) -> Series;
72}
73
74impl SeriesExtension for Series {
75    fn build_from_list(input_slice: &[&str]) -> Series {
76        // Step 1: Create inner Series from slice
77        // (Dtype: String, Shape: (N,)).
78        let inner_series = Series::new("inner_content".into(), input_slice);
79
80        // Step 2: Wrap the inner Series in a vector to create a single-row List Series.
81        // Polars builds a List Series where each Series in the vector becomes one row's list.
82        // By providing a vector with one inner Series, we get a List Series with one row.
83        // (Dtype: List<String>, Shape: (1,)).
84        let list_series: Series = Series::new(
85            "list_string".into(), // Final Series name
86            vec![inner_series],   // Vector with one Series => List Series with one row
87        );
88
89        list_series
90    }
91}
92
93/// Builds a Polars Expression to replace specified string values (after trimming)
94/// with NULL within selected columns of a DataFrame.
95///
96/// Values are replaced if they match any string in the hardcoded list
97/// `null_value_list: Vec<&str>` after trimming leading/trailing whitespace.
98///
99pub fn build_null_expression(null_values_expr: Expr, apply_to_all_columns: bool) -> Expr {
100    // --- Define Replacement Logic based on the flag ---
101    let replacement_expr: Expr = if apply_to_all_columns {
102        // Universal Mode: Apply to ALL columns via casting and trimming string representation
103        let condition = all() // Select current column value
104            .as_expr()
105            .cast(DataType::String) // Cast to String
106            .str()
107            .strip_chars(lit(NULL)) // Trim whitespace from string representation
108            .is_in(null_values_expr, true); // Check if trimmed string is in the list
109
110        when(condition) // WHEN the trimmed string representation matches...
111            .then(lit(NULL)) // THEN replace original value with NULL
112            .otherwise(all()) // OTHERWISE keep the original value
113            .name()
114            .keep() // Keep original column name
115    } else {
116        // String-Only Mode: Apply only to String columns, trim original string
117        let string_cols_selector = dtype_col(&DataType::String).as_selector().as_expr();
118
119        let condition = string_cols_selector // Select only string columns
120            .clone() // Clone needed for use in `otherwise`
121            .str()
122            .strip_chars(lit(NULL)) // Trim whitespace from the original string value
123            .is_in(null_values_expr, true); // Check if trimmed string is in the list
124
125        when(condition) // WHEN the trimmed string matches...
126            // THEN replace with NULL (cast needed for type consistency within String col expr)
127            .then(lit(NULL).cast(DataType::String))
128            // OTHERWISE keep the original string value
129            .otherwise(string_cols_selector)
130            .name()
131            .keep() // Keep original column name
132    };
133
134    replacement_expr
135}
136
137//----------------------------------------------------------------------------//
138//                                   Tests                                    //
139//----------------------------------------------------------------------------//
140
141/// Run tests with:
142/// `cargo test -- --show-output tests_replace_values_with_null`
143#[cfg(test)]
144mod tests_replace_values_with_null {
145    use super::*; // Import the function from the parent module
146    use polars::functions::concat_df_horizontal;
147
148    // Helper to create a consistent test DataFrame using Option for nullability
149    fn create_test_df() -> PolarsResult<DataFrame> {
150        df!(
151            // Use Option<&str> for string columns that can contain nulls
152            "col_str" => &[
153                Some("Keep"), Some(" N/A "), Some("<N/D>"), Some("  "), Some("Value"),
154                None, // Use None for NULL
155                Some("NA"), Some("999"), Some("3.45"), Some("false")
156            ],
157            // Use Option<i32> for integer columns, even if currently no nulls, for consistency
158            "col_int" => &[
159                Some(1), Some(2), Some(999), Some(4), Some(5),
160                Some(6), Some(7), Some(999), Some(0), Some(10)
161            ],
162            // Use Option<f64> for float columns
163            "col_flt" => &[
164                Some(1.1), Some(2.2), Some(999.0), Some(999.1), Some(5.5),
165                Some(6.6), Some(7.7), Some(8.8), Some(3.45), Some(10.1)
166            ],
167            // Use Option<bool> for boolean columns
168            "col_bool" => &[
169                Some(true), Some(false), Some(true), Some(false), Some(true),
170                Some(true), Some(false), Some(true), Some(false), Some(true)
171            ],
172            // Use Option<&str> again for this nullable string column
173            "col_str_ws" => &[
174                Some(" leading"), Some("trailing "), Some(" both "), Some(""), Some("NA"),
175                Some("  NA  "),
176                None, // Use None for NULL
177                Some("ok"), Some("999 "), Some(" 3.45")
178            ]
179        )
180    }
181
182    // Define the null markers used in tests
183    const NULL_MARKERS: &[&str] = &["", "<N/D>", "NA", "N/A", "999", "3.45", "false"];
184
185    /// Comprehensive test covering various data types and trimming.
186    /// `cargo test -- --show-output test_universal_replacement_mixed_types`
187    #[test]
188    fn test_universal_replacement_mixed_types() -> Result<(), PolarsError> {
189        // Input DataFrame - df! infers i32 and datetime[ms] here
190        let df_input = df![
191            "col_str" =>    &[Some("Keep"), Some(" NA "), Some("<N/D>"), Some("  "), None, Some("999"), Some("3.45"), Some("false"), Some("2024-01-15")],
192            "col_int" =>    &[Some(123i32), Some(999i32), Some(-10i32), Some(999i32), Some(200i32), Some(0i32), Some(999i32), Some(1i32), Some(2i32)], // Explicit i32
193            "col_float" =>  &[Some(1.1), Some(3.45), Some(-2.2), None, Some(999.0), Some(0.0), Some(123.456), Some(3.450), Some(5.0)], // f64 inferred
194            "col_bool" =>   &[Some(true), Some(false), None, Some(true), Some(false), Some(true), Some(true), Some(false), Some(true)], // bool inferred
195        ]?;
196
197        // Define null markers - **ADJUST DATETIME MARKER**
198        let null_markers = &[
199            "",      // Matches "  " after trimming
200            "NA",    // Matches " NA " after trimming
201            "<N/D>", // Exact match
202            "999",   // Will match integer 999 and string "999"
203            "3.45",  // Will match float 3.45 and string "3.45"
204            "false", // Matches relevant bools/strings
205            "2024-01-15",
206        ];
207
208        // Expected DataFrame - let df! infer types matching input (i32, datetime[ms])
209        let df_expected = df![
210             "col_str" =>    &[Some("Keep"), None, None, None, None, None, None, None, None],
211             "col_int" =>    &[Some(123i32), None, Some(-10i32), None, Some(200i32), Some(0i32), None, Some(1i32), Some(2i32)], // Use i32
212             "col_float" =>  &[Some(1.1), None, Some(-2.2), None, Some(999.0), Some(0.0), Some(123.456), None, Some(5.0)],
213             "col_bool" =>   &[Some(true), None, None, Some(true), None, Some(true), Some(true), None, Some(true)],
214        ]?;
215
216        println!("Input:\n{df_input}");
217        println!("Null Markers: {null_markers:?}");
218        let df_output = replace_values_with_null(df_input, null_markers, true)?;
219        println!("Output:\n{df_output}");
220        println!("Expected:\n{df_expected}");
221
222        // Compare schema and values
223        assert_eq!(
224            df_output.schema(),
225            df_expected.schema(),
226            "Schemas do not match"
227        );
228        assert!(
229            df_output.equals_missing(&df_expected),
230            "DataFrames did not match for universal mixed type test."
231        );
232
233        Ok(())
234    }
235
236    /// cargo test -- --show-output test_replace_values_with_null_string_only
237    #[test]
238    fn test_replace_values_with_null_string_only() -> PolarsResult<()> {
239        let df_input = df![
240        "col1" => &[" A ", "B ", " C", "D", "", " ", "NA", "KEEP", " null "],
241        "col2" => &[Some(1i32), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(8), Some(9)],
242        "col3" => &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true)],
243        "col4" => &[" NA ", "ignore", "N/A", "None", "999", "", "KEEP", "other", "false"]
244        ]?;
245
246        let null_list = &["NA", "", "999", "N/A", "null"]; // Include "999" as string, "" for whitespace trimming check
247
248        let df_expected = df![
249        "col1" => &[Some(" A "), Some("B "), Some(" C"), Some("D"), None, None, None, Some("KEEP"), None],
250        "col2" => &[Some(1i32), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(8), Some(9)],
251        "col3" => &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true), Some(false), Some(true)],
252        "col4" => &[None, Some("ignore"), None, Some("None"), None, None, Some("KEEP"), Some("other"), Some("false")]
253        ]?;
254
255        println!("Input:\n{df_input}");
256        println!("Null List: {null_list:?}");
257        let df_output = replace_values_with_null(df_input, null_list, false)?; // string-only
258        println!("Output:\n{df_output}");
259        println!("Expected:\n{df_expected}");
260
261        // Compare schema and values
262        assert_eq!(
263            df_output.schema(),
264            df_expected.schema(),
265            "Schemas do not match"
266        );
267        assert!(
268            df_output.equals_missing(&df_expected),
269            "DataFrames did not match for universal mixed type test."
270        );
271
272        Ok(())
273    }
274
275    /// Test whitespace-only strings specifically.
276    /// `cargo test -- --show-output test_universal_whitespace_handling`
277    #[test]
278    fn test_universal_whitespace_handling() -> Result<(), PolarsError> {
279        let df_input = df!(
280            "col_a" => &[Some("   "), Some("\t\n"), Some("Keep"), Some(" Val "), None, Some("")],
281        )?;
282
283        // Case 1: Target empty string "" -> whitespace should be nullified
284        let null_markers_with_empty = &["", "Val"]; // Match empty string and "Val" (after trim)
285        let df_expected_with_empty = df!(
286            "col_a" => &[None::<&str>, None::<&str>, Some("Keep"), None, None, None],
287        )?;
288        let df_output_with_empty = replace_values_with_null(
289            df_input.clone(), // Clone input for the first case
290            null_markers_with_empty,
291            true,
292        )?;
293        assert!(
294            df_output_with_empty.equals_missing(&df_expected_with_empty),
295            "Whitespace not nullified when '' IS targeted.\nOutput:\n{df_output_with_empty:?}\nExpected:\n{df_expected_with_empty:?}"
296        );
297
298        // Case 2: Do NOT target empty string "" -> whitespace should NOT be nullified
299        let null_markers_without_empty = &["Val"]; // Only target "Val" (after trim)
300        let df_expected_without_empty = df!(
301            // "   ", "\t\n", "" remain because "" is not targeted after trimming them
302            "col_a" => &[Some("   "), Some("\t\n"), Some("Keep"), None, None, Some("")],
303        )?;
304        let df_output_without_empty = replace_values_with_null(
305            df_input.clone(), // Clone input for the second case
306            null_markers_without_empty,
307            true,
308        )?;
309        assert!(
310            df_output_without_empty.equals_missing(&df_expected_without_empty),
311            "Whitespace incorrectly nullified when '' NOT targeted.\nOutput:\n{df_output_without_empty:?}\nExpected:\n{df_expected_without_empty:?}"
312        );
313
314        Ok(())
315    }
316
317    /// `cargo test -- --show-output test_string_columns_only`
318    #[test]
319    fn test_string_columns_only() -> Result<(), PolarsError> {
320        let df_input = create_test_df()?;
321
322        // Expected DataFrame now also uses Option for clarity, matching the input style
323        let df_expected = df!(
324            "col_str" =>    &[Some("Keep"), None, None, None, Some("Value"), None, None, None, None, None],
325            "col_int" =>    &[Some(1), Some(2), Some(999), Some(4), Some(5), Some(6), Some(7), Some(999), Some(0), Some(10)],
326            "col_flt" =>    &[Some(1.1), Some(2.2), Some(999.0), Some(999.1), Some(5.5), Some(6.6), Some(7.7), Some(8.8), Some(3.45), Some(10.1)],
327            "col_bool" =>   &[Some(true), Some(false), Some(true), Some(false), Some(true), Some(true), Some(false), Some(true), Some(false), Some(true)],
328            "col_str_ws" => &[Some(" leading"), Some("trailing "), Some(" both "), None, None, None, None, Some("ok"), None, None]
329        )?;
330
331        println!("Input:\n{df_input}");
332        println!("Null Markers: {NULL_MARKERS:?}");
333        let df_output = replace_values_with_null(df_input, NULL_MARKERS, false)?;
334        println!("Output:\n{df_output}");
335        println!("Expected:\n{df_expected}");
336
337        assert_eq!(df_output, df_expected);
338        Ok(())
339    }
340
341    /// `cargo test -- --show-output test_all_columns`
342    #[test]
343    fn test_all_columns() -> Result<(), PolarsError> {
344        let df_input = create_test_df()?;
345
346        let df_expected = df!(
347            "col_str" =>    &[Some("Keep"), None, None, None, Some("Value"), None, None, None, None, None],
348            "col_int" =>    &[Some(1), Some(2), None, Some(4), Some(5), Some(6), Some(7), None, Some(0), Some(10)],
349            "col_flt" =>    &[Some(1.1), Some(2.2), Some(999.0), Some(999.1), Some(5.5), Some(6.6), Some(7.7), Some(8.8), None, Some(10.1)],
350            "col_bool" =>   &[Some(true), None, Some(true), None, Some(true), Some(true), None, Some(true), None, Some(true)],
351            "col_str_ws" => &[Some(" leading"), Some("trailing "), Some(" both "), None, None, None, None, Some("ok"), None, None]
352        )?;
353
354        println!("Input:\n{df_input}");
355        println!("Null Markers: {NULL_MARKERS:?}");
356        let df_output = replace_values_with_null(df_input, NULL_MARKERS, true)?;
357        println!("Output:\n{df_output}");
358        println!("Expected:\n{df_expected}");
359
360        assert_eq!(df_output, df_expected);
361        Ok(())
362    }
363
364    #[test]
365    fn test_empty_null_list() -> Result<(), PolarsError> {
366        let df_orig = create_test_df()?;
367        println!("df_orig: {df_orig:?}");
368
369        let result_str_only = replace_values_with_null(df_orig.clone(), &[], false)?;
370        let result_all_cols = replace_values_with_null(df_orig.clone(), &[], true)?;
371
372        assert_eq!(result_str_only, df_orig);
373        assert_eq!(result_all_cols, df_orig);
374
375        Ok(())
376    }
377
378    #[test]
379    fn test_no_matches_in_list() -> Result<(), PolarsError> {
380        let df_orig = create_test_df()?;
381        println!("df_orig: {df_orig:?}");
382
383        let no_match_markers = &["XYZ", "12345", "NO_MATCH"];
384
385        let result_str_only = replace_values_with_null(df_orig.clone(), no_match_markers, false)?;
386        let result_all_cols = replace_values_with_null(df_orig.clone(), no_match_markers, true)?;
387
388        assert_eq!(result_str_only, df_orig);
389        assert_eq!(result_all_cols, df_orig);
390
391        Ok(())
392    }
393
394    #[test]
395    fn test_all_nulls_input() -> Result<(), PolarsError> {
396        let df = df!(
397            "a" => &[Option::<i32>::None, None], // Already using Option correctly here
398            "b" => &[Option::<&str>::None, None] // Already using Option correctly here
399        )?;
400        println!("df: {df:?}");
401
402        let result_str_only = replace_values_with_null(df.clone(), NULL_MARKERS, false)?;
403        let result_all_cols = replace_values_with_null(df.clone(), NULL_MARKERS, true)?;
404
405        assert_eq!(result_str_only, df);
406        assert_eq!(result_all_cols, df);
407        Ok(())
408    }
409
410    /// cargo test -- --show-output test_remove_leading_and_trailing_chars
411    #[test]
412    fn test_remove_leading_and_trailing_chars() -> Result<(), PolarsError> {
413        /// Define values to be interpreted as null across all columns.
414        pub static NULL_VALUES_TEST: [&str; 3] = [
415            "",           // Represents empty strings --> null
416            "<N/D>",      // Specific placeholder string 1
417            "*DIVERSOS*", // Specific placeholder string 2
418        ];
419
420        let df_input = df! {
421            "foo" => &["", " ", "hello ", " <N/D> ", " *DIVERSOS* \n ", " world", " \n\r *DIVERSOS* \n ", "<N/D>"],
422        }?;
423
424        println!("df_input: {df_input}");
425
426        // Create a Polars Series containing the *strings* to be treated as null markers.
427        let series = Series::new("null_vals".into(), NULL_VALUES_TEST);
428        let null_values_expr: Expr = series.implode()?.into_series().lit();
429
430        let condition = all() // Select current column value
431            .as_expr()
432            .cast(DataType::String) // Cast to String
433            .str()
434            .strip_chars(lit(NULL)) // Trim whitespace from string representation
435            .is_in(null_values_expr.clone(), true); // Check if trimmed string is in the list
436        println!("condition: {condition}");
437
438        let replacement_expr: Expr = build_null_expression(null_values_expr, true);
439        println!("replacement_expr: {replacement_expr}");
440
441        let mut df_temp = df_input
442            .clone()
443            .lazy()
444            .with_columns([condition.alias("other name"), replacement_expr]) // Apply the selected expression
445            .collect()?;
446        df_temp.set_column_names(["foo_stripped", "is_in condition"])?;
447
448        // Concat DataFrames horizontally.
449        // let df_output = df_input.hstack(df_temp.get_columns())?;
450        let df_output = concat_df_horizontal(&[df_input, df_temp], true)?;
451
452        println!("df_output: {df_output}");
453
454        let vec_from_series: Vec<&str> = df_output
455            .column("foo_stripped")?
456            .str()?
457            .iter() // Iterator over Option<&str>
458            .map(|opt_str| opt_str.unwrap_or("null"))
459            .collect();
460
461        println!("vec_from_series: {vec_from_series:?}");
462
463        let vec_from_series: Vec<Option<&str>> = df_output
464            .column("foo_stripped")?
465            .str()?
466            .iter() // Iterator over Option<&str>
467            .collect();
468
469        println!("vec_from_series: {vec_from_series:?}");
470
471        let df_expected = df! {
472            "foo" => &["", " ", "hello ", " <N/D> ", " *DIVERSOS* \n ", " world", " \n\r *DIVERSOS* \n ", "<N/D>"],
473            "foo_stripped" => &[None, None, Some("hello "), None, None, Some(" world"), None, None],
474            "is_in condition" => &[true, true, false, true, true, false, true, true],
475        }?;
476
477        assert_eq!(
478            df_output, df_expected,
479            "DataFrame mismatch after schema modify and null handling"
480        );
481
482        assert_eq!(
483            df_output.schema(),
484            df_expected.schema(),
485            "DataFrame mismatch schema"
486        );
487
488        Ok(())
489    }
490
491    #[test]
492    /// cargo test -- --show-output test_create_list_series_direct_from_series
493    fn test_create_list_series_direct_from_series() -> Result<(), PolarsError> {
494        // Define input data for the list
495        let null_value_list: &[&str] = &["", "<N/D>", "SIM"];
496
497        // Step 1: Create the inner String Series from the data.
498        let inner_series = Series::new("inner_content".into(), null_value_list);
499        println!("inner_series: {inner_series}");
500
501        assert_eq!(inner_series.len(), 3, "The length of the Series must be 3");
502
503        // Step 2: Create the List Series using a Vec containing the inner Series.
504        let list_series: Series = Series::new("list_string".into(), vec![inner_series]);
505        println!("list_series: {list_series}");
506
507        assert_eq!(list_series.len(), 1, "The length of the Series must be 1");
508
509        assert_eq!(
510            list_series.dtype(),
511            &DataType::List(Box::new(DataType::String)),
512            "Resulting series should be List<String>"
513        );
514
515        let result_series: Series = Series::build_from_list(null_value_list);
516
517        assert_eq!(result_series.len(), 1); // List series itself has 1 row
518        assert_eq!(result_series.name(), "list_string");
519        assert_eq!(
520            result_series.dtype(),
521            &DataType::List(Box::new(DataType::String))
522        );
523        assert_eq!(result_series, list_series);
524
525        Ok(())
526    }
527}