use crate::{PolarsViewError, PolarsViewResult};
use polars::prelude::*;
use regex::Regex;
/// Removes from `df` every column whose name matches `regex_pattern`.
///
/// `regex_pattern` is interpreted as follows:
/// - the literal `"*"` is a wildcard that selects every column (no regex is compiled);
/// - any other value must start with `^` and end with `$`, and is compiled with
///   [`Regex::new`].
///
/// # Returns
///
/// - the original `df`, untouched, when no column name is selected;
/// - [`DataFrame::empty`] when every column is selected;
/// - otherwise `df` with the selected columns removed via `drop_many`.
///
/// # Errors
///
/// - [`PolarsViewError::InvalidRegexPattern`] when the pattern is not `"*"` and is not
///   wrapped in `^` ... `$`.
/// - [`PolarsViewError::InvalidRegexSyntax`] when the pattern is wrapped correctly but the
///   regex engine rejects it.
pub fn drop_columns_by_regex(df: DataFrame, regex_pattern: &str) -> PolarsViewResult<DataFrame> {
    // `None` stands for the "*" wildcard; `Some(re)` for a validated, compiled pattern.
    let matcher: Option<Regex> = if regex_pattern == "*" {
        None
    } else if regex_pattern.starts_with('^') && regex_pattern.ends_with('$') {
        let re = Regex::new(regex_pattern).map_err(|e| PolarsViewError::InvalidRegexSyntax {
            pattern: regex_pattern.to_string(),
            error: e.to_string(),
        })?;
        Some(re)
    } else {
        return Err(PolarsViewError::InvalidRegexPattern(
            regex_pattern.to_string(),
        ));
    };

    tracing::debug!("Compiled regex (None for wildcard): {:?}", matcher);

    // Collect the names that will be removed, preserving the DataFrame's column order.
    let all_names = df.get_column_names_owned();
    let doomed: Vec<PlSmallStr> = match &matcher {
        // Wildcard: every column is selected.
        None => all_names,
        Some(re) => all_names
            .into_iter()
            .filter(|name| re.is_match(name))
            .collect(),
    };

    let doomed_count = doomed.len();
    let total_count = df.width();

    // Nothing selected: hand the input back unchanged.
    if doomed_count == 0 {
        tracing::debug!(
            "No columns matching regex '{}' found to drop. Returning original DataFrame.",
            regex_pattern
        );
        return Ok(df);
    }

    // Everything selected: the result is a brand-new empty DataFrame.
    if doomed_count == total_count {
        tracing::debug!("All columns matched. Returning empty 0x0 DataFrame.");
        return Ok(DataFrame::empty());
    }

    // Partial selection: drop only the selected columns.
    tracing::debug!("Dropping {} columns out of {}.", doomed_count, total_count);
    tracing::debug!("Dropping columns: {:?}", doomed);
    Ok(df.drop_many(doomed))
}
/// Unit tests for `drop_columns_by_regex`.
#[cfg(test)]
mod tests_drop_cols {
    use super::*;
    use crate::PolarsViewError;
    use polars::df;

    /// Asserts that two DataFrames are equal (via `equals_missing`) and, on failure,
    /// prints both frames and their schemas together with `context`.
    fn assert_df_equal(df_output: &DataFrame, df_expected: &DataFrame, context: &str) {
        let frames_match = df_output.equals_missing(df_expected);
        assert!(
            frames_match,
            "\nAssertion Failed: {}\n\nOutput DF:\n{}\nSchema: {:?}\n\nExpected DF:\n{}\nSchema: {:?}\n",
            context,
            df_output,
            df_output.schema(),
            df_expected,
            df_expected.schema()
        );
    }

    /// Builds the five-column, two-row fixture shared by most tests.
    fn create_shared_test_df() -> PolarsViewResult<DataFrame> {
        let shared = df!(
            "ID" => &[1, 2],
            "Value_A" => &["apple", "banana"],
            "Description B" => &[Some("desc 1"), None],
            "Value_C" => &[10.1, 20.2],
            "IgnoreMe" => &[true, false]
        )?;
        Ok(shared)
    }

    /// Prints the input frame and pattern, runs `drop_columns_by_regex`, then prints the
    /// output and the expected frame. Returns the output for further assertions.
    fn drop_and_report(
        df_input: DataFrame,
        regex: &str,
        df_expected: &DataFrame,
    ) -> PolarsViewResult<DataFrame> {
        println!("Input DF:\n{df_input}");
        println!("regex:{regex}");
        let df_output = drop_columns_by_regex(df_input, regex)?;
        println!("Output DF:\n{df_output}");
        println!("Expected DF:\n{df_expected}");
        Ok(df_output)
    }

    /// An exact-name pattern removes only that one column.
    #[test]
    fn test_drop_single_column() -> PolarsViewResult<()> {
        let df_input = create_shared_test_df()?;
        let regex = r#"^Description B$"#;
        let df_expected = df!(
            "ID" => &[1, 2],
            "Value_A" => &["apple", "banana"],
            "Value_C" => &[10.1, 20.2],
            "IgnoreMe" => &[true, false]
        )?;

        let df_output = drop_and_report(df_input, regex, &df_expected)?;

        assert_df_equal(
            &df_output,
            &df_expected,
            "Drop single column 'Description B'",
        );
        assert_eq!(df_output.width(), 4);
        Ok(())
    }

    /// A prefix pattern removes every column sharing that prefix.
    #[test]
    fn test_drop_multiple_columns_pattern() -> PolarsViewResult<()> {
        let df_input = create_shared_test_df()?;
        let regex = r#"^Value_.*$"#;
        let df_expected = df!(
            "ID" => &[1, 2],
            "Description B" => &[Some("desc 1"), None],
            "IgnoreMe" => &[true, false]
        )?;

        let df_output = drop_and_report(df_input, regex, &df_expected)?;

        assert_df_equal(
            &df_output,
            &df_expected,
            "Drop columns matching '^Value_.*$'",
        );
        assert_eq!(df_output.width(), 3);
        Ok(())
    }

    /// The `"*"` wildcard removes everything, yielding an empty DataFrame.
    #[test]
    fn test_drop_all_columns_wildcard() -> PolarsViewResult<()> {
        let df_input = create_shared_test_df()?;
        let regex = "*";
        let df_expected = DataFrame::empty();

        let df_output = drop_and_report(df_input, regex, &df_expected)?;

        assert_eq!(df_output.width(), 0, "Wildcard should drop all columns");
        assert_eq!(
            df_output.height(),
            0,
            "Dropping all columns via lazy drop results in 0 height"
        );
        assert_df_equal(&df_output, &df_expected, "Drop all columns using wildcard");
        Ok(())
    }

    /// A pattern matching no column name leaves the DataFrame unchanged.
    #[test]
    fn test_drop_no_matching_columns() -> PolarsViewResult<()> {
        let df_input = create_shared_test_df()?;
        let regex = r#"^NonExistent$"#;
        let df_expected = df_input.clone();

        let df_output = drop_and_report(df_input, regex, &df_expected)?;

        assert_df_equal(&df_output, &df_expected, "No matching columns to drop");
        assert_eq!(df_output.width(), 5);
        Ok(())
    }

    /// A correctly anchored pattern that the regex engine rejects is reported as
    /// `InvalidRegexSyntax`, carrying the offending pattern.
    #[test]
    fn test_invalid_regex_syntax() {
        let df_input = create_shared_test_df().unwrap();
        let regex = r#"^Value[.*$"#;

        let result = drop_columns_by_regex(df_input, regex);

        assert!(
            result.is_err(),
            "Expected an error for invalid regex syntax"
        );
        let is_syntax_error = matches!(
            &result,
            Err(PolarsViewError::InvalidRegexSyntax { pattern, .. }) if pattern == regex
        );
        assert!(
            is_syntax_error,
            "Expected InvalidRegexSyntax error, got {result:?}"
        );
    }

    /// A pattern lacking the `^` ... `$` anchors is reported as `InvalidRegexPattern`,
    /// carrying the offending pattern.
    #[test]
    fn test_invalid_regex_format() {
        let df_input = create_shared_test_df().unwrap();
        let regex = "Value_.*";

        let result = drop_columns_by_regex(df_input, regex);

        assert!(
            result.is_err(),
            "Expected an error for invalid regex format"
        );
        let is_format_error = matches!(
            &result,
            Err(PolarsViewError::InvalidRegexPattern(pattern)) if pattern == regex
        );
        assert!(
            is_format_error,
            "Expected InvalidRegexPattern error, got {result:?}"
        );
    }

    /// Column selection works on a zero-row DataFrame; the surviving column is the
    /// input's second column.
    #[test]
    fn test_regex_on_empty_df() -> PolarsViewResult<()> {
        let df_input = df!(
            "A" => Vec::<i32>::new(),
            "B" => Vec::<String>::new(),
            "AA" => Vec::<u64>::new(),
        )?;
        println!("Input DF:\n{df_input}");

        let regex = r#"^A+$"#;
        println!("regex:{regex}");

        // The input is cloned because it is compared against the output below.
        let df_output = drop_columns_by_regex(df_input.clone(), regex)?;
        let df_expected = df!(
            "B" => Vec::<String>::new()
        )?;
        println!("Output DF:\n{df_output}");
        println!("Expected DF:\n{df_expected}");

        assert_df_equal(&df_output, &df_expected, "Empty input DataFrame");
        assert_eq!(df_output.columns().first(), df_input.columns().get(1));
        Ok(())
    }

    /// The wildcard on a zero-row DataFrame also yields an empty DataFrame.
    #[test]
    fn test_wildcard_on_empty_df() -> PolarsViewResult<()> {
        let df_input = df!(
            "A" => Vec::<i32>::new(),
            "B" => Vec::<String>::new()
        )?;
        let regex = "*";
        let df_expected = DataFrame::empty();

        let df_output = drop_and_report(df_input, regex, &df_expected)?;

        assert_df_equal(&df_output, &df_expected, "Wildcard on empty DataFrame");
        assert_eq!(df_output.width(), 0);
        assert_eq!(df_output.height(), 0);
        assert!(df_output.columns().is_empty());
        Ok(())
    }
}