use crate::DEFAULT_INDEX_COLUMN_NAME;
use polars::prelude::{DataType, Schema};
pub const DEFAULT_QUERY: &str = "\
-- Select all columns and rows
SELECT *
FROM AllData;
";
const COLS_FILTER_OUT: [&str; 14] = [
DEFAULT_INDEX_COLUMN_NAME, "Average",
"Frequency",
"Total",
"As Float",
"Category",
"Calculation Result",
"Row Count",
"Unique Values",
"Minimum",
"Maximum",
"New Name for",
"+ 10%",
"DMY (day/month/year)", ];
fn is_filtered_col(name: &str) -> bool {
let trimmed_name = name.trim();
if trimmed_name.is_empty() {
return true; }
COLS_FILTER_OUT
.iter()
.any(|substring| trimmed_name.contains(substring))
}
fn find_nth_col_name(
schema: &Schema,
n: usize,
target_dtype: impl Fn(&DataType) -> bool,
) -> Option<&str> {
schema
.iter()
.filter(|(name, dtype)| !is_filtered_col(name.as_str()) && target_dtype(dtype))
.nth(n) .map(|(name, _dtype)| name.as_str())
}
fn generate_limit_example(commands: &mut Vec<String>) {
commands.push(
"\
-- Limit the number of rows returned
SELECT *
FROM AllData
LIMIT 50;
"
.to_string(),
);
}
fn generate_select_specific_columns(
commands: &mut Vec<String>,
opt_str_col: Option<&str>,
opt_num_col: Option<&str>, ) {
if let (Some(col1), Some(col2)) = (opt_str_col, opt_num_col) {
commands.push(format!(
"\
-- Select specific columns by name
SELECT
`{col1}`,
`{col2}`
FROM AllData;
"
));
}
}
fn generate_except_example(
commands: &mut Vec<String>,
opt_col1: Option<&str>, opt_col2: Option<&str>, ) {
if let (Some(col1), Some(col2)) = (opt_col1, opt_col2) {
commands.push(format!(
"\
-- Select all columns EXCEPT specific ones
-- Useful for dropping temporary or unwanted columns
SELECT *
EXCEPT (
`{col1}`,
`{col2}`
)
FROM AllData;
"
));
}
}
fn generate_rename_example(
commands: &mut Vec<String>,
opt_col1: Option<&str>, opt_col2: Option<&str>, ) {
if let (Some(col1), Some(col2)) = (opt_col1, opt_col2) {
commands.push(format!(
"\
-- Rename columns while preserving order
SELECT *
RENAME (
`{col1}` AS `New Name for {col1}`,
`{col2}` AS `New Name for {col2}`
)
FROM AllData;
"
));
}
}
fn generate_replace_float_example(commands: &mut Vec<String>, opt_float_col: Option<&str>) {
if let Some(float_col) = opt_float_col {
commands.push(format!(
"\
-- Replace values in a column with a calculation
-- Example: Increase float column `{float_col}` by 10%
SELECT *
REPLACE (
`{float_col}` * 1.1
AS
`{float_col}`
)
FROM AllData;
"
));
}
}
fn generate_add_new_column_example(
commands: &mut Vec<String>,
opt_float_col: Option<&str>,
opt_int_col: Option<&str>,
) {
if let (Some(col_f), Some(col_i)) = (opt_float_col, opt_int_col) {
commands.push(format!(
"\
-- Add a NEW calculated column 'Calculation Result'
-- Selects all original columns PLUS the new one
SELECT *,
`{col_f}` * `{col_i}`
AS
`Calculation Result`
FROM AllData;
"
));
}
}
fn generate_combined_transform_example(
commands: &mut Vec<String>,
opt_col1: Option<&str>, opt_col2: Option<&str>, opt_float_col: Option<&str>,
) {
if let (Some(col1), Some(col2), Some(float_col)) = (opt_col1, opt_col2, opt_float_col) {
commands.push(format!(
"\
-- Except, Replace and Rename
SELECT *
EXCEPT (`{col1}`)
REPLACE (
`{float_col}` * 1.1
AS
`{float_col}`
)
RENAME (
`{float_col}` AS `{float_col} + 10%`,
`{col2}` AS `New Name for {col2}`
)
FROM AllData;
"
));
}
}
fn generate_cast_int_to_float_example(commands: &mut Vec<String>, opt_int_col: Option<&str>) {
if let Some(int_col) = opt_int_col {
commands.push(format!(
"\
-- Explicitly CAST an integer column to FLOAT
SELECT *,
CAST(`{int_col}` AS DOUBLE) AS `{int_col} As Float`
FROM AllData;
"
));
}
}
fn generate_case_when_example(
commands: &mut Vec<String>,
opt_num_col: Option<&str>, ) {
if let Some(num_col) = opt_num_col {
commands.push(format!(
"\
-- Create a new column based on conditions using CASE WHEN
SELECT *,
CASE
WHEN `{num_col}` > 100 THEN 'High'
WHEN `{num_col}` > 10 THEN 'Medium'
ELSE 'Low'
END AS `{num_col} Category`
FROM AllData;
"
));
}
}
fn generate_where_isnull_example(
commands: &mut Vec<String>,
opt_any_col: Option<&str>, ) {
if let Some(col) = opt_any_col {
commands.push(format!(
"\
-- Filter rows where a column IS NULL or IS NOT NULL
SELECT *
FROM AllData
WHERE
`{col}` IS NULL;
-- Replace IS NULL with IS NOT NULL to get non-null rows
"
));
}
}
fn generate_where_int_equality_example(commands: &mut Vec<String>, opt_int_col: Option<&str>) {
if let Some(int_col) = opt_int_col {
commands.push(format!(
"\
-- Filter rows where a numeric column equals a specific value
-- number comparisons: =, >, <, >=, <=, !=
SELECT *
FROM AllData
WHERE
`{int_col}` = 2024;
-- Or filter by an (inclusive) range of values:
-- WHERE `{int_col}` BETWEEN 2020 AND 2024
-- Or filter based on a list of values using IN:
-- WHERE `{int_col}` IN (2020, 2022, 2024)
"
));
}
}
fn generate_where_string_filter_example(commands: &mut Vec<String>, opt_str_col: Option<&str>) {
if let Some(str_col) = opt_str_col {
commands.push(format!(
"\
-- Filter rows based on string content
SELECT *
FROM AllData
WHERE
`{str_col}` = 'Specific Value';
-- Or using LIKE for pattern matching (case-sensitive)
-- WHERE `{str_col}` LIKE 'Prefix%';
-- WHERE `{str_col}` LIKE '%Middle%';
-- WHERE `{str_col}` LIKE '%Suffix';
-- Use ILIKE for case-insensitive matching
-- WHERE `{str_col}` ILIKE 'pattern%';
-- Or filter based on a list of values using IN:
-- WHERE `{str_col}` IN ('Value A', 'Value B')
"
));
}
}
fn generate_where_combined_example(
commands: &mut Vec<String>,
opt_int_col: Option<&str>,
opt_str_col: Option<&str>,
) {
if let (Some(int_col), Some(str_col)) = (opt_int_col, opt_str_col) {
commands.push(format!(
"\
-- Combine multiple filter conditions using AND / OR
SELECT *
FROM AllData
WHERE
`{int_col}` >= 2024
AND
`{str_col}` != 'Exclude This'
-- Or filter based on a list of values using IN:
-- WHERE `{int_col}` IN (2020, 2022, 2024)
-- WHERE `{str_col}` IN ('Value A', 'Value B')
"
));
}
}
fn generate_orderby_example(
commands: &mut Vec<String>,
opt_str_col: Option<&str>,
opt_num_col: Option<&str>, ) {
if let (Some(col1), Some(col2)) = (opt_str_col, opt_num_col) {
commands.push(format!(
"\
-- Sort results by a single column (ASC default, or DESC) or
-- Sort results by multiple columns (precedence based on order)
SELECT *
FROM AllData
ORDER BY
`{col1}` ASC, -- Primary sort key
`{col2}` DESC; -- Secondary sort key
"
));
}
}
fn generate_groupby_count_example(
commands: &mut Vec<String>,
opt_cat_col: Option<&str>, opt_col2: Option<&str>, ) {
if let (Some(col1), Some(col2)) = (opt_cat_col, opt_col2) {
commands.push(format!(
"\
-- Count rows per category (Frequency)
SELECT
`{col1}`,
`{col2}`,
COUNT(*) AS Frequency
FROM AllData
GROUP BY
`{col1}`,
`{col2}`
ORDER BY
Frequency DESC;
"
));
}
}
fn generate_groupby_sum_example(
commands: &mut Vec<String>,
opt_str_col: Option<&str>, opt_num_col: Option<&str>, ) {
if let (Some(cat_col), Some(num_col)) = (opt_str_col, opt_num_col) {
commands.push(format!(
"\
-- Calculate SUM of a numeric column per category
SELECT
`{cat_col}`,
SUM(`{num_col}`) AS `{num_col} Total`
FROM AllData
GROUP BY
`{cat_col}`
ORDER BY
`{num_col} Total` DESC;
"
));
}
}
fn generate_groupby_multiple_aggregates_example(
commands: &mut Vec<String>,
opt_str_col: Option<&str>, opt_val_col: Option<&str>, ) {
if let (Some(cat_col), Some(val_col)) = (opt_str_col, opt_val_col) {
commands.push(format!(
"\
-- Calculate multiple aggregate functions per category
SELECT
`{cat_col}`,
-- Total rows per category
COUNT(*) AS `Row Count`,
-- Count unique values in `{val_col}`:
COUNT(DISTINCT `{val_col}`) AS `Unique Values`,
SUM(`{val_col}`) AS Total, -- Sum of val_col
AVG(`{val_col}`) AS Average, -- Average of val_col
MIN(`{val_col}`) AS Minimum, -- Minimum value
MAX(`{val_col}`) AS Maximum -- Maximum value
FROM AllData
GROUP BY
`{cat_col}`
ORDER BY
`Row Count` DESC;
"
));
}
}
fn generate_having_example(
commands: &mut Vec<String>,
opt_str_col: Option<&str>, opt_val_col: Option<&str>, ) {
if let (Some(cat_col), Some(val_col)) = (opt_str_col, opt_val_col) {
commands.push(format!(
"\
-- Use HAVING to filter groups based on aggregate results
SELECT
`{cat_col}`,
AVG(`{val_col}`) AS `{val_col} Average`
FROM AllData
GROUP BY
`{cat_col}`
HAVING
-- Only show categories where the average > 1000
`{val_col} Average` > 1000
ORDER BY
`{val_col} Average` DESC;
"
));
}
}
fn generate_strftime_example(commands: &mut Vec<String>, opt_date_col: Option<&str>) {
if let Some(date_col) = opt_date_col {
commands.push(format!(
"\
-- Format a date column into DD/MM/YYYY using STRFTIME
SELECT *, -- Select all original columns
STRFTIME(`{date_col}`, '%d/%m/%Y') AS `DMY (day/month/year)`
FROM AllData
ORDER BY
`{date_col}`;
"
));
}
}
fn generate_distinct_example(
commands: &mut Vec<String>,
opt_col1: Option<&str>, opt_col2: Option<&str>, ) {
if let (Some(col1), Some(col2)) = (opt_col1, opt_col2) {
commands.push(format!(
"\
-- Show distinct values of columns
SELECT DISTINCT
`{col1}`,
`{col2}`
FROM AllData
ORDER BY
`{col2}`;
"
));
}
}
fn replace_cell_value(
commands: &mut Vec<String>,
opt_int_col: Option<&str>,
opt_str_col: Option<&str>,
) {
match (opt_int_col, opt_str_col) {
(Some(int_col), _) => {
commands.push(format!(
"\
-- Replaces cell values based on a condition using CASE WHEN in REPLACE
SELECT *
REPLACE (
CASE
-- Condition to identify the specific row (example uses an integer column)
WHEN `{int_col}` >= 10
-- The new value for cells (some integer or string value)
THEN 'New String Value'
-- Keep the original values for all other rows in this column
ELSE `{int_col}`
-- Apply the result back to the original column name or another column
END AS `{int_col}`
)
FROM AllData;
"
));
}
(None, Some(str_col)) => {
commands.push(format!(
"\
-- Replaces cell values based on a condition using CASE WHEN in REPLACE
SELECT *
REPLACE (
CASE
-- Condition to identify the specific row (example uses a string column with ILIKE)
WHEN `{str_col}` ILIKE '%pattern%'
-- The new value for cells (some integer or string value)
THEN 'New String Value'
-- Keep the original values for all other rows in this column
ELSE `{str_col}`
-- Apply the result back to the original column name or another column
END AS `{str_col}`
)
FROM AllData;
"
));
}
(None, None) => (), }
}
pub fn sql_commands(schema: &Schema) -> Vec<String> {
let mut commands: Vec<String> = vec![DEFAULT_QUERY.to_string()];
let is_any = |dtype: &DataType| !dtype.is_null();
let opt_str_col = find_nth_col_name(schema, 0, |dtype| dtype.is_string());
let opt_int_col = find_nth_col_name(schema, 0, |dtype| dtype.is_integer());
let opt_float_col = find_nth_col_name(schema, 0, |dtype| dtype.is_float());
let opt_date_col = find_nth_col_name(schema, 0, |dtype| dtype.is_date());
let opt_any_col = find_nth_col_name(schema, 0, is_any); let opt_any_col_1 = find_nth_col_name(schema, 1, is_any); let opt_any_col_2 = find_nth_col_name(schema, 2, is_any);
let opt_int_or_float_col = opt_int_col.or(opt_float_col);
let opt_date_int_str_or_any_col = opt_date_col.or(opt_int_col).or(opt_str_col).or(opt_any_col);
generate_limit_example(&mut commands);
generate_select_specific_columns(&mut commands, opt_str_col, opt_int_or_float_col);
generate_except_example(&mut commands, opt_any_col, opt_any_col_2); generate_rename_example(&mut commands, opt_any_col, opt_any_col_1); generate_replace_float_example(&mut commands, opt_float_col);
replace_cell_value(&mut commands, opt_int_col, opt_str_col); generate_add_new_column_example(&mut commands, opt_float_col, opt_int_col);
generate_combined_transform_example(&mut commands, opt_any_col, opt_any_col_1, opt_float_col);
generate_cast_int_to_float_example(&mut commands, opt_int_col);
generate_case_when_example(&mut commands, opt_int_or_float_col);
generate_where_isnull_example(&mut commands, opt_any_col);
generate_where_int_equality_example(&mut commands, opt_int_col);
generate_where_string_filter_example(&mut commands, opt_str_col);
generate_where_combined_example(&mut commands, opt_int_col, opt_str_col);
generate_orderby_example(&mut commands, opt_str_col, opt_int_or_float_col);
generate_groupby_count_example(&mut commands, opt_date_int_str_or_any_col, opt_any_col_2); generate_groupby_sum_example(&mut commands, opt_str_col, opt_int_or_float_col); generate_groupby_multiple_aggregates_example(&mut commands, opt_str_col, opt_int_or_float_col); generate_having_example(&mut commands, opt_str_col, opt_int_or_float_col);
generate_strftime_example(&mut commands, opt_date_col);
generate_distinct_example(&mut commands, opt_any_col, opt_any_col_2);
commands
}