use super::types::{get_quoted_column_name, CLOSED_VALUES, POSITION_VALUES};
use super::{
DefaultAesthetics, DefaultParamValue, GeomTrait, GeomType, ParamConstraint, ParamDefinition,
StatResult,
};
use crate::naming;
use crate::plot::types::{DefaultAestheticValue, ParameterValue, Parameters};
use crate::reader::SqlDialect;
use crate::{DataFrame, GgsqlError, Mappings, Result};
use super::types::Schema;
#[derive(Debug, Clone, Copy)]
pub struct Histogram;
impl GeomTrait for Histogram {
fn geom_type(&self) -> GeomType {
GeomType::Histogram
}
fn aesthetics(&self) -> DefaultAesthetics {
DefaultAesthetics {
defaults: &[
("pos1", DefaultAestheticValue::Required),
("weight", DefaultAestheticValue::Null),
("fill", DefaultAestheticValue::String("black")),
("stroke", DefaultAestheticValue::String("black")),
("opacity", DefaultAestheticValue::Number(0.8)),
("pos2", DefaultAestheticValue::Delayed),
("pos1end", DefaultAestheticValue::Delayed),
("pos2end", DefaultAestheticValue::Delayed), ],
}
}
fn default_remappings(&self) -> DefaultAesthetics {
DefaultAesthetics {
defaults: &[
("pos1", DefaultAestheticValue::Column("bin")),
("pos1end", DefaultAestheticValue::Column("bin_end")),
("pos2", DefaultAestheticValue::Column("count")),
("pos2end", DefaultAestheticValue::Number(0.0)),
],
}
}
fn valid_stat_columns(&self) -> &'static [&'static str] {
&["bin", "bin_end", "count", "density"]
}
fn default_params(&self) -> &'static [ParamDefinition] {
const PARAMS: &[ParamDefinition] = &[
ParamDefinition {
name: "bins",
default: DefaultParamValue::Number(30.0),
constraint: ParamConstraint::count(1.0),
},
ParamDefinition {
name: "closed",
default: DefaultParamValue::String("right"),
constraint: ParamConstraint::string_option(CLOSED_VALUES),
},
ParamDefinition {
name: "binwidth",
default: DefaultParamValue::Null,
constraint: ParamConstraint::number_min_exclusive(0.0),
},
ParamDefinition {
name: "position",
default: DefaultParamValue::String("stack"),
constraint: ParamConstraint::string_option(POSITION_VALUES),
},
];
PARAMS
}
fn stat_consumed_aesthetics(&self) -> &'static [&'static str] {
&["pos1"]
}
fn apply_stat_transform(
&self,
query: &str,
_schema: &Schema,
aesthetics: &Mappings,
group_by: &[String],
parameters: &Parameters,
execute_query: &dyn Fn(&str) -> Result<DataFrame>,
dialect: &dyn SqlDialect,
aesthetic_ctx: &crate::plot::aesthetic::AestheticContext,
) -> Result<StatResult> {
stat_histogram(
query,
aesthetics,
group_by,
parameters,
execute_query,
dialect,
aesthetic_ctx,
)
}
}
impl std::fmt::Display for Histogram {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "histogram")
}
}
fn stat_histogram(
query: &str,
aesthetics: &Mappings,
group_by: &[String],
parameters: &Parameters,
execute_query: &dyn Fn(&str) -> Result<DataFrame>,
dialect: &dyn SqlDialect,
aesthetic_ctx: &crate::plot::aesthetic::AestheticContext,
) -> Result<StatResult> {
let x_col = get_quoted_column_name(aesthetics, "pos1").ok_or_else(|| {
let name = aesthetic_ctx.map_internal_to_user("pos1");
GgsqlError::ValidationError(format!("Histogram requires '{}' aesthetic mapping", name))
})?;
let ParameterValue::Number(bins) = parameters.get("bins").unwrap() else {
unreachable!("bins validated by ParamConstraint::count")
};
let bins = *bins as usize;
let ParameterValue::String(closed) = parameters.get("closed").unwrap() else {
unreachable!("closed validated by ParamConstraint::string_option")
};
let closed = closed.as_str();
let explicit_binwidth = parameters.get("binwidth").and_then(|p| match p {
ParameterValue::Number(n) => Some(*n),
_ => None,
});
let stats_query = format!(
"SELECT MIN({x}) as min_val, MAX({x}) as max_val FROM ({query}) AS \"__ggsql_stats__\"",
x = x_col,
query = query
);
let stats_df = execute_query(&stats_query)?;
let (min_val, max_val) = extract_histogram_min_max(&stats_df)?;
let bin_width = if let Some(bw) = explicit_binwidth {
bw
} else if min_val >= max_val {
1.0 } else {
((max_val - min_val) / (bins - 1) as f64 * 1e10).round() / 1e10
};
let min_val = (min_val * 1e10).round() / 1e10;
let bin_expr = if closed == "left" {
format!(
"(FLOOR(({x} - {min} + {w} * 0.5) / {w})) * {w} + {min} - {w} * 0.5",
x = x_col,
min = min_val,
w = bin_width
)
} else {
let ceil_expr = format!(
"CEIL(({x} - {min} + {w} * 0.5) / {w}) - 1",
x = x_col,
min = min_val,
w = bin_width
);
let clamped = dialect.sql_greatest(&["0", &ceil_expr]);
format!(
"({clamped}) * {w} + {min} - {w} * 0.5",
clamped = clamped,
w = bin_width,
min = min_val
)
};
let bin_end_expr = format!("{expr} + {w}", expr = bin_expr, w = bin_width);
let group_cols = if group_by.is_empty() {
bin_expr.clone()
} else {
let mut cols: Vec<String> = group_by.to_vec();
cols.push(bin_expr.clone());
cols.join(", ")
};
let agg_expr = if let Some(weight_value) = aesthetics.get("weight") {
if weight_value.is_literal() {
return Err(GgsqlError::ValidationError(
"Histogram weight aesthetic must be a column, not a literal".to_string(),
));
}
if let Some(weight_col) = weight_value.column_name() {
format!("SUM({})", naming::quote_ident(weight_col))
} else {
"COUNT(*)".to_string()
}
} else {
"COUNT(*)".to_string()
};
let stat_bin = naming::stat_column("bin");
let stat_bin_end = naming::stat_column("bin_end");
let stat_count = naming::stat_column("count");
let stat_density = naming::stat_column("density");
let q_bin = naming::quote_ident(&stat_bin);
let q_bin_end = naming::quote_ident(&stat_bin_end);
let q_count = naming::quote_ident(&stat_count);
let q_density = naming::quote_ident(&stat_density);
let (binned_select, final_select) = if group_by.is_empty() {
(
format!(
"{} AS {}, {} AS {}, {} AS {}",
bin_expr, q_bin, bin_end_expr, q_bin_end, agg_expr, q_count
),
format!(
"*, {count} * 1.0 / SUM({count}) OVER () AS {density}",
count = q_count,
density = q_density
),
)
} else {
let grp_cols = group_by.join(", ");
(
format!(
"{}, {} AS {}, {} AS {}, {} AS {}",
grp_cols, bin_expr, q_bin, bin_end_expr, q_bin_end, agg_expr, q_count
),
format!(
"*, {count} * 1.0 / SUM({count}) OVER (PARTITION BY {grp}) AS {density}",
count = q_count,
grp = grp_cols,
density = q_density
),
)
};
let transformed_query = format!(
"WITH \"__stat_src__\" AS ({query}), \"__binned__\" AS (SELECT {binned} FROM \"__stat_src__\" GROUP BY {group}) SELECT {final} FROM \"__binned__\"",
query = query,
binned = binned_select,
group = group_cols,
final = final_select
);
Ok(StatResult::Transformed {
query: transformed_query,
stat_columns: vec![
"bin".to_string(),
"bin_end".to_string(),
"count".to_string(),
"density".to_string(),
],
dummy_columns: vec![],
consumed_aesthetics: vec!["pos1".to_string(), "weight".to_string()],
})
}
pub fn extract_histogram_min_max(df: &DataFrame) -> Result<(f64, f64)> {
if df.height() == 0 {
return Err(GgsqlError::ValidationError(
"No data for histogram statistics".to_string(),
));
}
let extract = |name: &str| -> Option<f64> {
use arrow::array::Array;
use arrow::datatypes::DataType;
let col = df.column(name).ok()?;
if col.is_null(0) {
return None;
}
let casted = crate::array_util::cast_array(col, &DataType::Float64).ok()?;
crate::array_util::as_f64(&casted).ok().map(|a| a.value(0))
};
let min_val = extract("min_val").ok_or_else(|| {
GgsqlError::ValidationError("Could not extract min value for histogram".to_string())
})?;
let max_val = extract("max_val").ok_or_else(|| {
GgsqlError::ValidationError("Could not extract max value for histogram".to_string())
})?;
Ok((min_val, max_val))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::df;
#[test]
fn test_extract_min_max_null_errors() {
let df = df! {
"min_val" => vec![None::<f64>],
"max_val" => vec![None::<f64>],
}
.unwrap();
assert!(extract_histogram_min_max(&df).is_err());
}
}