use crate::{PolarsViewError, PolarsViewResult};
use polars::prelude::*;
use regex::Regex;
/// Rewrites selected string columns of `df` as `Float64` columns, treating `.`
/// as a separator to drop and `,` as the decimal mark.
///
/// # Column selection
///
/// * `"*"` selects every column of `df`.
/// * Any other `regex_pattern` must start with `^` and end with `$`; it is
///   compiled with [`Regex`] and tested against each column name.
///
/// # Transformation
///
/// For every selected column, each literal `.` is removed, then each literal
/// `,` is replaced by `.`, and the result is cast to `Float64`
/// (e.g. `"1.234,56"` becomes `1234.56`). The tests in this file expect text
/// that is still not a number after the replacements (such as `"Invalid"` or
/// `""`) to come out as null.
///
/// # Errors
///
/// * [`PolarsViewError::InvalidRegexPattern`] when the pattern is not `"*"` and
///   is not wrapped in `^` ... `$`.
/// * [`PolarsViewError::InvalidRegexSyntax`] when the pattern fails to compile.
/// * [`PolarsViewError::InvalidDataTypeForRegex`] when at least one selected
///   column is not of type `String`; all offending columns are listed.
/// * Any error raised while collecting the lazy query, converted through
///   `PolarsViewError::from`.
///
/// When no column is selected, `df` is returned unchanged.
pub fn normalize_float_strings_by_regex(
    df: DataFrame,
    regex_pattern: &str,
) -> PolarsViewResult<DataFrame> {
    // The wildcard is not a regex: it is represented by `None` and matches everything.
    let compiled_regex: Option<Regex> = if regex_pattern == "*" {
        None
    } else {
        let is_anchored = regex_pattern.starts_with('^') && regex_pattern.ends_with('$');
        if !is_anchored {
            return Err(PolarsViewError::InvalidRegexPattern(
                regex_pattern.to_string(),
            ));
        }

        let regex = Regex::new(regex_pattern).map_err(|e| PolarsViewError::InvalidRegexSyntax {
            pattern: regex_pattern.to_string(),
            error: e.to_string(),
        })?;
        Some(regex)
    };

    tracing::debug!("Compiled regex (None for wildcard): {:?}", compiled_regex);

    // Split the selected columns into those that can be transformed (String)
    // and those that make the request invalid (anything else).
    let mut columns_to_transform: Vec<PlSmallStr> = Vec::new();
    let mut error_columns_mismatched_type: Vec<String> = Vec::new();

    let schema = df.schema();
    for (col_name, dtype) in schema.iter() {
        // Skip columns the regex rejects; with the wildcard nothing is skipped.
        if compiled_regex
            .as_ref()
            .is_some_and(|re| !re.is_match(col_name))
        {
            continue;
        }

        match dtype {
            DataType::String => columns_to_transform.push(col_name.clone()),
            _ => error_columns_mismatched_type.push(format!("'{col_name}' (Type: {dtype})")),
        }
    }

    // A single non-string match aborts the whole operation.
    if !error_columns_mismatched_type.is_empty() {
        return Err(PolarsViewError::InvalidDataTypeForRegex {
            pattern: regex_pattern.to_string(),
            columns: error_columns_mismatched_type,
        });
    }

    // Nothing selected: hand the frame back untouched.
    if columns_to_transform.is_empty() {
        tracing::debug!(
            "No string columns matching regex '{}' found for normalization.",
            regex_pattern
        );
        return Ok(df);
    }

    tracing::debug!(
        "Applying normalization to columns: {:?}",
        columns_to_transform
    );

    // Order matters: dots must be removed before commas are turned into dots.
    // Both replacements are literal (third argument `true`), not regex based.
    let normalized = cols(columns_to_transform)
        .as_expr()
        .str()
        .replace_all(lit("."), lit(""), true)
        .str()
        .replace_all(lit(","), lit("."), true)
        .cast(DataType::Float64);

    df.lazy()
        .with_columns([normalized])
        .collect()
        .map_err(PolarsViewError::from)
}
#[cfg(test)]
mod tests_normalize_float_strings {
use super::*;
/// Asserts that two data frames are equal according to
/// `DataFrame::equals_missing`, printing `context` and both frames on failure.
fn assert_df_equal(df_output: &DataFrame, df_expected: &DataFrame, context: &str) {
    let frames_match = df_output.equals_missing(df_expected);
    assert!(
        frames_match,
        "\nAssertion Failed: {context}\nOutput DF:\n{df_output}\nExpected DF:\n{df_expected}\n"
    );
}
/// Builds the eight-row fixture shared by several tests.
///
/// Columns: an integer `ID`, four nullable string columns holding numbers in
/// comma-decimal (`Value_PT_*`), dot-decimal (`Value_US`) and mixed/invalid
/// (`Mixed_Data`) notation, a plain text `Description`, and a float column
/// `Already_F64`.
fn create_shared_df() -> PolarsViewResult<DataFrame> {
    let ids = [1, 2, 3, 4, 5, 6, 7, 8];

    // Dot as thousands separator, comma as decimal mark.
    let value_pt_1 = [
        Some("1.234,56"),
        Some("78,90"),
        Some("1.000"),
        Some("-10,0"),
        Some("500,"),
        None,
        Some("0,1"),
        Some("10"),
    ];

    let description = [
        Some("A"),
        Some("B"),
        Some("C"),
        Some("D"),
        Some("E"),
        None,
        Some("G"),
        Some("H"),
    ];

    let value_pt_2 = [
        Some("-1,0"),
        Some("2.000,5"),
        Some("3,00"),
        Some("1."),
        Some("9.999,99"),
        Some("123"),
        None,
        Some(""),
    ];

    // Comma as thousands separator, dot as decimal mark.
    let value_us = [
        Some("1,234.56"),
        Some("78.90"),
        Some("1,000"),
        Some("-10.0"),
        Some("500."),
        None,
        Some("0.1"),
        Some("10"),
    ];

    // Valid numbers mixed with text, empty strings and odd separator usage.
    let mixed_data = [
        Some("1,0"),
        Some("Invalid"),
        None,
        Some(""),
        Some("-1.000,5"),
        Some(",5"),
        Some("."),
        Some("1.2.3,4.5"),
    ];

    let already_f64 = [10.1, 20.2, 30.3, 40.4, 50.5, 60.6, 70.7, 80.8];

    let frame = df!(
        "ID" => &ids,
        "Value_PT_1" => &value_pt_1,
        "Description" => &description,
        "Value_PT_2" => &value_pt_2,
        "Value_US" => &value_us,
        "Mixed_Data" => &mixed_data,
        "Already_F64" => &already_f64
    );

    frame.map_err(PolarsViewError::from)
}
/// An exact-name pattern converts only `Value_PT_1`; every other column of the
/// shared fixture must stay as it was.
#[test]
fn test_normalize_single_pt_column() -> PolarsViewResult<()> {
    let regex = "^Value_PT_1$";
    let df_input = create_shared_df()?;

    // Same data as the fixture, except `Value_PT_1` is already numeric.
    let df_expected = df!(
        "ID" => &[1, 2, 3, 4, 5, 6, 7, 8],
        "Value_PT_1" => &[
            Some(1234.56), Some(78.90), Some(1000.0), Some(-10.0),
            Some(500.0), None, Some(0.1), Some(10.0)
        ],
        "Description" => &[
            Some("A"), Some("B"), Some("C"), Some("D"),
            Some("E"), None, Some("G"), Some("H")
        ],
        "Value_PT_2" => &[
            Some("-1,0"), Some("2.000,5"), Some("3,00"), Some("1."),
            Some("9.999,99"), Some("123"), None, Some("")
        ],
        "Value_US" => &[
            Some("1,234.56"), Some("78.90"), Some("1,000"), Some("-10.0"),
            Some("500."), None, Some("0.1"), Some("10")
        ],
        "Mixed_Data" => &[
            Some("1,0"), Some("Invalid"), None, Some(""),
            Some("-1.000,5"), Some(",5"), Some("."), Some("1.2.3,4.5")
        ],
        "Already_F64" => &[10.1, 20.2, 30.3, 40.4, 50.5, 60.6, 70.7, 80.8]
    )?
    .lazy()
    .with_column(col("Value_PT_1").cast(DataType::Float64))
    .collect()?;

    println!("Input DF:\n{df_input}");
    println!("regex:{regex}");

    // The input frame is not needed afterwards, so it is moved into the call.
    let df_output = normalize_float_strings_by_regex(df_input, regex)?;

    println!("Output DF:\n{df_output}");
    println!("Expected DF:\n{df_expected}");

    assert_df_equal(&df_output, &df_expected, "Single PT column normalization");
    assert_eq!(df_output.schema(), df_expected.schema());

    Ok(())
}
/// A prefix pattern converts all three `Value_*` columns at once.
///
/// `Value_US` is deliberately included: its dots are stripped like any other,
/// so e.g. `"78.90"` is expected to become `7890.0`.
#[test]
fn test_normalize_multiple_value_columns() -> PolarsViewResult<()> {
    let regex = "^Value_.*$";
    let df_input = create_shared_df()?;

    let df_expected = df!(
        "ID" => &[1, 2, 3, 4, 5, 6, 7, 8],
        "Value_PT_1" => &[
            Some(1234.56), Some(78.90), Some(1000.0), Some(-10.0),
            Some(500.0), None, Some(0.1), Some(10.0)
        ],
        "Description" => &[
            Some("A"), Some("B"), Some("C"), Some("D"),
            Some("E"), None, Some("G"), Some("H")
        ],
        "Value_PT_2" => &[
            Some(-1.0), Some(2000.5), Some(3.0), Some(1.0),
            Some(9999.99), Some(123.0), None, None
        ],
        "Value_US" => &[
            Some(1.23456), Some(7890.0), Some(1.0), Some(-100.0),
            Some(500.0), None, Some(1.0), Some(10.0)
        ],
        "Mixed_Data" => &[
            Some("1,0"), Some("Invalid"), None, Some(""),
            Some("-1.000,5"), Some(",5"), Some("."), Some("1.2.3,4.5")
        ],
        "Already_F64" => &[10.1, 20.2, 30.3, 40.4, 50.5, 60.6, 70.7, 80.8]
    )?
    .lazy()
    .with_columns(vec![
        col("Value_PT_1").cast(DataType::Float64),
        col("Value_PT_2").cast(DataType::Float64),
        col("Value_US").cast(DataType::Float64),
    ])
    .collect()?;

    println!("Input DF:\n{df_input}");
    println!("regex:{regex}");

    // The input frame is not needed afterwards, so it is moved into the call.
    let df_output = normalize_float_strings_by_regex(df_input, regex)?;

    println!("Output DF:\n{df_output}");
    println!("Expected DF:\n{df_expected}");

    assert_df_equal(
        &df_output,
        &df_expected,
        "Multiple Value_* column normalization",
    );
    assert_eq!(df_output.schema(), df_expected.schema());

    Ok(())
}
/// Converting `Mixed_Data` must yield numbers where the text normalizes to one
/// and nulls everywhere else (plain text, empty string, lone separator).
#[test]
fn test_normalize_mixed_data_column() -> PolarsViewResult<()> {
    let regex = "^Mixed_Data$";
    let df_input = create_shared_df()?;

    let df_expected = df!(
        "ID" => &[1, 2, 3, 4, 5, 6, 7, 8],
        "Value_PT_1" => &[
            Some("1.234,56"), Some("78,90"), Some("1.000"), Some("-10,0"),
            Some("500,"), None, Some("0,1"), Some("10")
        ],
        "Description" => &[
            Some("A"), Some("B"), Some("C"), Some("D"),
            Some("E"), None, Some("G"), Some("H")
        ],
        "Value_PT_2" => &[
            Some("-1,0"), Some("2.000,5"), Some("3,00"), Some("1."),
            Some("9.999,99"), Some("123"), None, Some("")
        ],
        "Value_US" => &[
            Some("1,234.56"), Some("78.90"), Some("1,000"), Some("-10.0"),
            Some("500."), None, Some("0.1"), Some("10")
        ],
        "Mixed_Data" => &[
            Some(1.0), None::<f64>, None::<f64>, None::<f64>,
            Some(-1000.5), Some(0.5), None::<f64>, Some(123.45)
        ],
        "Already_F64" => &[10.1, 20.2, 30.3, 40.4, 50.5, 60.6, 70.7, 80.8]
    )?
    .lazy()
    .with_column(col("Mixed_Data").cast(DataType::Float64))
    .collect()?;

    println!("Input DF:\n{df_input}");
    println!("regex:{regex}");

    // The input frame is not needed afterwards, so it is moved into the call.
    let df_output = normalize_float_strings_by_regex(df_input, regex)?;

    println!("Output DF:\n{df_output}");
    println!("Expected DF:\n{df_expected}");

    assert_df_equal(&df_output, &df_expected, "Mixed_Data column normalization");
    assert_eq!(df_output.schema(), df_expected.schema());

    Ok(())
}
/// A well-formed pattern that matches no column name must return the frame
/// unchanged, both in content and in schema.
#[test]
fn test_normalize_no_matching_columns() -> PolarsViewResult<()> {
    let regex = "^NonExistent_$";
    let df_input = create_shared_df()?;

    // The expected result is simply a copy of the input.
    let df_expected = df_input.clone();

    println!("Input DF:\n{df_input}");
    println!("regex:{regex}");

    let df_output = normalize_float_strings_by_regex(df_input, regex)?;

    println!("Output DF:\n{df_output}");
    println!("Expected DF:\n{df_expected}");

    assert_df_equal(&df_output, &df_expected, "No matching columns");
    assert_eq!(df_output.schema(), df_expected.schema());

    Ok(())
}
/// Selecting a column that is already `Float64` must be rejected.
///
/// `normalize_float_strings_by_regex` validates the data type of every
/// selected column before building any string expression, so the failure is
/// the typed `InvalidDataTypeForRegex` error. The test checks the variant, the
/// echoed pattern and the reported column instead of searching the rendered
/// error text for a substring, which would depend on the exact wording of the
/// message and could be satisfied by an unrelated error.
#[test]
fn test_normalize_error_on_already_float() -> PolarsViewResult<()> {
    let df_input = create_shared_df()?;
    let regex = "^Already_F64$";

    println!("Input DF:\n{df_input}");
    println!("regex:{regex}");

    let result = normalize_float_strings_by_regex(df_input, regex);
    println!("Result (expecting error): {result:?}");

    let Err(e) = result else {
        panic!("Expected an error when running on f64 column, but got Ok");
    };
    println!("Got expected error: {e}");

    // The error must identify both the pattern used and the offending column.
    assert!(
        matches!(
            &e,
            PolarsViewError::InvalidDataTypeForRegex { pattern, columns }
                if pattern.as_str() == regex
                    && columns.iter().any(|c| c.contains("Already_F64"))
        ),
        "Expected InvalidDataTypeForRegex naming column 'Already_F64', got: {e:?}"
    );

    Ok(())
}
/// Only the column named exactly by the pattern (`Value_EU`) is converted; the
/// sibling string column and the float column are left alone.
#[test]
fn test_normalize_specific_col() -> PolarsViewResult<()> {
    println!("--- Test: test_normalize_specific_col ---");

    let regex = "^Value_EU$";

    let df_input = df!(
        "ID" => &[1, 2, 3],
        "Value_EU" => &["1.234,56", "78,90", "100"],
        "Value_US" => &["1,234.56", "78.90", "100.00"],
        "Amount" => &[1234.56, 78.90, 100.0]
    )?;

    let df_expected = df!(
        "ID" => &[1, 2, 3],
        "Value_EU" => &[1234.56, 78.90, 100.0],
        "Value_US" => &["1,234.56", "78.90", "100.00"],
        "Amount" => &[1234.56, 78.90, 100.0]
    )?;

    println!("Input DF:\n{df_input}");
    println!("Regex: {regex}");

    // The input frame is not needed afterwards, so it is moved into the call.
    let df_output = normalize_float_strings_by_regex(df_input, regex)?;

    println!("Output DF:\n{df_output}");
    println!("Expected DF:\n{df_expected}");

    assert!(df_output.equals_missing(&df_expected));

    Ok(())
}
/// A suffix pattern converts every `*_EU` column and nothing else.
#[test]
fn test_normalize_regex_multi_col() -> PolarsViewResult<()> {
    println!("--- Test: test_normalize_regex_multi_col ---");

    let regex = "^.*_EU$";

    let df_input = df!(
        "Product" => &["A", "B"],
        "Price_EU" => &["1.000,50", "25,00"],
        "Tax_EU" => &["200,10", "5,00"],
        "Cost_US" => &["900.00", "20.00"]
    )?;

    // `Product` and `Cost_US` do not match the pattern and stay strings.
    let df_expected = df!(
        "Product" => &["A", "B"],
        "Price_EU" => &[1000.50, 25.0],
        "Tax_EU" => &[200.10, 5.0],
        "Cost_US" => &["900.00", "20.00"]
    )?;

    println!("Input DF:\n{df_input}");
    println!("Regex: {regex}");

    // The input frame is not needed afterwards, so it is moved into the call.
    let df_output = normalize_float_strings_by_regex(df_input, regex)?;

    println!("Output DF:\n{df_output}");
    println!("Expected DF:\n{df_expected}");

    assert!(df_output.equals_missing(&df_expected));

    Ok(())
}
/// Exercises the `"*"` wildcard in two situations:
///
/// 1. a frame containing a non-string column, which must be rejected with
///    `InvalidDataTypeForRegex`;
/// 2. a frame made only of string columns, which must be fully converted.
#[test]
fn test_normalize_wildcard() -> PolarsViewResult<()> {
    println!("--- Test: test_normalize_wildcard ---");

    // Case 1: the wildcard also selects the integer column, so the call fails.
    let regex_error = "*";
    let df_input_error = df!(
        "A_string" => &["1.000,50", "25,00"],
        "B_string" => &["200,10", "5,00"],
        "C_int" => &[1, 2]
    )?;

    println!("Test Case 1: Wildcard with mixed types");
    println!("Input DF:\n{df_input_error}");
    println!("Regex: {regex_error}");

    let result = normalize_float_strings_by_regex(df_input_error, regex_error);
    println!("Result (expecting error): {result:?}");

    assert!(matches!(
        result,
        Err(PolarsViewError::InvalidDataTypeForRegex { .. })
    ));

    println!("--");

    // Case 2: every column is a string, so every column is converted.
    let regex_ok = "*";
    let df_only_strings = df!(
        "A_string" => &["1.000,50", "25,00"],
        "B_string" => &["200,10", "5,00"]
    )?;
    let expected_df = df!(
        "A_string" => &[1000.5, 25.0],
        "B_string" => &[200.1, 5.0]
    )?;

    println!("Test Case 2: Wildcard with only string types");
    println!("Input DF:\n{df_only_strings}");
    println!("Regex: {regex_ok}");

    let df_output_ok = normalize_float_strings_by_regex(df_only_strings, regex_ok)?;

    println!("Output DF:\n{df_output_ok}");
    println!("Expected DF:\n{expected_df}");

    assert!(df_output_ok.equals_missing(&expected_df));

    Ok(())
}
/// A pattern that is neither `"*"` nor wrapped in `^` ... `$` must be rejected
/// with `InvalidRegexPattern` carrying the offending pattern.
#[test]
fn test_error_invalid_regex_pattern_format() -> PolarsViewResult<()> {
    println!("--- Test: test_error_invalid_regex_pattern_format ---");

    // No leading `^` and no trailing `$`.
    let regex = "Value_EU";
    let df_input = df!("col_A" => &["1,23"])?;

    println!("Input DF:\n{df_input}");
    println!("Regex: {regex}");

    let result = normalize_float_strings_by_regex(df_input, regex);
    println!("Result (expecting error InvalidRegexPattern): {result:?}");

    assert!(matches!(result, Err(PolarsViewError::InvalidRegexPattern(s)) if s == regex));

    Ok(())
}
/// A correctly anchored pattern that does not compile must be rejected with
/// `InvalidRegexSyntax` carrying the offending pattern.
#[test]
fn test_error_invalid_regex_syntax() -> PolarsViewResult<()> {
    println!("--- Test: test_error_invalid_regex_syntax ---");

    // Anchored on both sides, but the character class `[` is never closed.
    let regex = "^Val[ue$";
    let df_input = df!("col_A" => &["1,23"])?;

    println!("Input DF:\n{df_input}");
    println!("Regex: {regex}");

    let result = normalize_float_strings_by_regex(df_input, regex);
    println!("Result (expecting error InvalidRegexSyntax): {result:?}");

    assert!(matches!(
        result,
        Err(PolarsViewError::InvalidRegexSyntax { pattern, .. }) if pattern == regex
    ));

    Ok(())
}
/// When the pattern selects a string column and an integer column together,
/// the call must fail and name the integer column with its data type.
#[test]
fn test_error_non_string_column_match() -> PolarsViewResult<()> {
    println!("--- Test: test_error_non_string_column_match ---");

    let regex = "^.*_EU$";

    // `Value_EU` is a valid target; `Count_EU` is an i64 column and is not.
    let df_input = df!(
        "Value_EU" => &["1.000,50"],
        "Count_EU" => &[1000i64]
    )?;

    println!("Input DF:\n{df_input}");
    println!("Regex: {regex}");

    let result = normalize_float_strings_by_regex(df_input, regex);
    println!("Result (expecting error InvalidDataTypeForRegex): {result:?}");

    assert!(matches!(
        result,
        Err(PolarsViewError::InvalidDataTypeForRegex{ pattern, columns })
            if pattern == regex && columns.contains(&"'Count_EU' (Type: i64)".to_string())
    ));

    Ok(())
}
/// An empty frame has no columns to select, so both the wildcard and a
/// match-all regex must return it unchanged.
#[test]
fn test_empty_dataframe() -> PolarsViewResult<()> {
    println!("--- Test: test_empty_dataframe ---");

    let regex_wildcard = "*";
    let regex_pattern = "^.*$";

    let df_input = DataFrame::default();
    let df_expected = df_input.clone();

    // Wildcard selection.
    println!("Input DF (empty):\n{df_input}");
    println!("Regex: {regex_wildcard}");

    let df_output_wild = normalize_float_strings_by_regex(df_input.clone(), regex_wildcard)?;

    println!("Output DF (wildcard):\n{df_output_wild}");
    println!("Expected DF (empty):\n{df_expected}");
    assert!(df_output_wild.equals(&df_expected));

    println!("--");

    // Match-all regex selection; the input is no longer needed, so it is moved.
    println!("Regex: {regex_pattern}");

    let df_output_regex = normalize_float_strings_by_regex(df_input, regex_pattern)?;

    println!("Output DF (regex):\n{df_output_regex}");
    println!("Expected DF (empty):\n{df_expected}");
    assert!(df_output_regex.equals(&df_expected));

    Ok(())
}
}