use crate::naming;
use crate::plot::aesthetic::AestheticContext;
use crate::plot::scale::{
default_oob, gets_default_scale, infer_scale_target_type, infer_transform_from_input_range,
is_facet_aesthetic, transform::Transform, OOB_CENSOR, OOB_KEEP, OOB_SQUISH,
};
use crate::plot::{
AestheticValue, ArrayElement, ArrayElementType, ColumnInfo, Layer, ParameterValue, Plot, Scale,
ScaleType, ScaleTypeKind, Schema,
};
use crate::{DataFrame, GgsqlError, Result};
use arrow::array::ArrayRef;
use std::collections::{HashMap, HashSet};
use super::schema::TypeInfo;
pub fn create_missing_scales(spec: &mut Plot) {
let aesthetic_ctx = spec.get_aesthetic_context();
let mut used_aesthetics: HashSet<String> = HashSet::new();
for layer in &spec.layers {
for aesthetic in layer.mappings.aesthetics.keys() {
let primary = aesthetic_ctx
.primary_internal_position(aesthetic)
.unwrap_or(aesthetic);
used_aesthetics.insert(primary.to_string());
}
for aesthetic in layer.remappings.aesthetics.keys() {
let primary = aesthetic_ctx
.primary_internal_position(aesthetic)
.unwrap_or(aesthetic);
used_aesthetics.insert(primary.to_string());
}
}
let existing_scales: HashSet<String> =
spec.scales.iter().map(|s| s.aesthetic.clone()).collect();
for aesthetic in used_aesthetics {
if !existing_scales.contains(&aesthetic) {
let mut scale = Scale::new(&aesthetic);
if !gets_default_scale(&aesthetic) {
scale.scale_type = Some(ScaleType::identity());
}
spec.scales.push(scale);
}
}
}
/// Creates scales for aesthetics found in the layers' current `mappings`
/// and then infers missing scale types from the layer data.
///
/// Unlike [`create_missing_scales`], only `mappings` are inspected here (not
/// `remappings`). After adding any missing scales, every scale that still has
/// no scale type and whose aesthetic satisfies `gets_default_scale` gets a type
/// inferred from the data type of the first matching column found in
/// `data_map`. Scales with no matching column are left untouched.
///
/// # Errors
///
/// The visible body never produces an error; it always returns `Ok(())`.
pub fn create_missing_scales_post_stat(
    spec: &mut Plot,
    data_map: &HashMap<String, DataFrame>,
) -> Result<()> {
    let aesthetic_ctx = spec.get_aesthetic_context();

    let mapped: HashSet<String> = spec
        .layers
        .iter()
        .flat_map(|layer| layer.mappings.aesthetics.keys())
        .map(|name| {
            aesthetic_ctx
                .primary_internal_position(name)
                .unwrap_or(name)
                .to_string()
        })
        .collect();

    let already_scaled: HashSet<String> = spec
        .scales
        .iter()
        .map(|existing| existing.aesthetic.clone())
        .collect();

    for name in mapped {
        if already_scaled.contains(&name) {
            continue;
        }
        let mut scale = Scale::new(&name);
        if !gets_default_scale(&name) {
            scale.scale_type = Some(ScaleType::identity());
        }
        spec.scales.push(scale);
    }

    for scale in spec.scales.iter_mut() {
        if scale.scale_type.is_some() || !gets_default_scale(&scale.aesthetic) {
            continue;
        }
        let columns =
            find_columns_for_aesthetic(&spec.layers, &scale.aesthetic, data_map, &aesthetic_ctx);
        // Only the first matching column decides the inferred type.
        if let Some(first) = columns.first() {
            scale.scale_type = Some(ScaleType::infer_for_aesthetic(
                first.data_type(),
                &scale.aesthetic,
            ));
        }
    }

    Ok(())
}
pub fn apply_post_stat_binning(
spec: &Plot,
data_map: &mut HashMap<String, DataFrame>,
) -> Result<()> {
let aesthetic_ctx = spec.get_aesthetic_context();
let targeted_per_layer: Vec<HashSet<String>> = spec
.layers
.iter()
.map(|layer| {
crate::plot::layer::geom::stat_aggregate::targeted_aesthetics(
&layer.parameters,
&layer.mappings,
&aesthetic_ctx,
)
})
.collect();
for scale in &spec.scales {
match &scale.scale_type {
Some(st) if st.scale_type_kind() == ScaleTypeKind::Binned => {}
_ => continue,
}
let breaks = match scale.properties.get("breaks") {
Some(ParameterValue::Array(arr)) if arr.len() >= 2 => arr,
_ => continue,
};
let break_values: Vec<f64> = breaks.iter().filter_map(|e| e.to_f64()).collect();
if break_values.len() < 2 {
continue;
}
let closed_left = match scale.properties.get("closed") {
Some(ParameterValue::String(s)) => s != "right",
_ => true,
};
let aesthetics_to_check = aesthetic_ctx
.internal_position_family(&scale.aesthetic)
.map(|f| f.to_vec())
.unwrap_or_else(|| vec![scale.aesthetic.clone()]);
for (idx, layer) in spec.layers.iter().enumerate() {
let data_key = naming::layer_key(idx);
if !data_map.contains_key(&data_key) {
continue;
}
for aes_name in &aesthetics_to_check {
let col_name = match layer.mappings.get(aes_name) {
Some(crate::AestheticValue::Column { name, .. }) => name.clone(),
_ => continue,
};
let df = match data_map.get(&data_key) {
Some(d) => d,
None => continue,
};
if df.column(&col_name).is_err() {
continue;
}
if naming::is_aesthetic_column(&col_name)
&& !targeted_per_layer[idx].contains(aes_name)
{
continue;
}
let binned_df =
apply_binning_to_dataframe(df, &col_name, &break_values, closed_left)?;
data_map.insert(data_key.clone(), binned_df);
}
}
}
Ok(())
}
/// Replaces each value of `col_name` with the center of the bin it falls in.
///
/// `break_values` are the bin edges; consecutive pairs form the bins, and the
/// center is the arithmetic mean of a bin's two edges. The column is cast to
/// `Float64` before binning, so the resulting column is always `Float64`.
///
/// * `closed_left == true`: bins are `[lower, upper)`, except the last bin,
///   which also includes its upper edge.
/// * `closed_left == false`: bins are `(lower, upper]`, except the first bin,
///   which also includes its lower edge.
///
/// Null inputs stay null. Non-null values that fall in no bin become `NaN`.
///
/// # Errors
///
/// Returns an error if fewer than two break values are supplied (no bin can be
/// formed), if the column is missing, if it cannot be cast to `Float64`, or if
/// the column cannot be replaced in the frame.
pub fn apply_binning_to_dataframe(
    df: &DataFrame,
    col_name: &str,
    break_values: &[f64],
    closed_left: bool,
) -> Result<DataFrame> {
    use crate::array_util::{as_f64, cast_array, new_f64_array};
    use arrow::array::Array;
    use arrow::datatypes::DataType;

    // Guard against `len() - 1` underflow and out-of-bounds edge indexing.
    if break_values.len() < 2 {
        return Err(GgsqlError::InternalError(format!(
            "Cannot bin column '{}': at least two break values are required, got {}",
            col_name,
            break_values.len()
        )));
    }

    let column = df.column(col_name)?;
    let float_col = cast_array(column, &DataType::Float64).map_err(|e| {
        GgsqlError::InternalError(format!("Cannot bin column '{}': {}", col_name, e))
    })?;
    let f64_arr = as_f64(&float_col)?;

    // Index of the final bin; safe because at least two breaks exist.
    let last_bin = break_values.len() - 2;

    let bin_center = |val: f64| -> f64 {
        for (i, edges) in break_values.windows(2).enumerate() {
            let (lower, upper) = (edges[0], edges[1]);
            let in_bin = if closed_left {
                // The last bin is closed on both sides so the maximum is kept.
                val >= lower && (if i == last_bin { val <= upper } else { val < upper })
            } else {
                // The first bin is closed on both sides so the minimum is kept.
                val <= upper && (if i == 0 { val >= lower } else { val > lower })
            };
            if in_bin {
                return (lower + upper) / 2.0;
            }
        }
        // Outside every bin.
        f64::NAN
    };

    let binned: Vec<Option<f64>> = (0..f64_arr.len())
        .map(|idx| {
            if f64_arr.is_null(idx) {
                None
            } else {
                Some(bin_center(f64_arr.value(idx)))
            }
        })
        .collect();

    let binned_array = new_f64_array(binned);
    df.with_column(col_name, binned_array)
        .map_err(|e| GgsqlError::InternalError(format!("Failed to replace column: {}", e)))
}
/// Resolves the scale type and the default transform of every scale in `spec`.
///
/// Column data types for each scale are gathered from `layer_type_info` via
/// [`collect_dtypes_for_aesthetic`] and combined with `coerce_dtypes`.
///
/// **Scales with an explicit scale type**
/// * A `Continuous` scale on a facet aesthetic is rejected.
/// * If the data types coerce to a common type, the scale type validates it.
///   A failure to coerce is silently ignored in this branch.
/// * If no transform is set and none was requested explicitly, a transform is
///   chosen: for `Discrete`/`Ordinal` scales it is inferred from the input
///   range when possible, otherwise the scale type's default transform is used.
///
/// **Scales without a scale type**
/// * Scales with no matching columns are skipped.
/// * The scale type is derived from an explicit transform when there is one
///   (`Identity` falls back to data-type inference), otherwise from the common
///   data type.
/// * The transform is then chosen as above, except that only `Discrete` scales
///   consult the input range.
///
/// # Errors
///
/// Returns `GgsqlError::ValidationError` for a continuous facet scale, for a
/// data type rejected by an explicit scale type, or when the column types of an
/// untyped scale cannot be coerced to a common type. All messages name the
/// aesthetic as returned by `map_internal_to_user`.
pub fn resolve_scale_types_and_transforms(
    spec: &mut Plot,
    layer_type_info: &[Vec<TypeInfo>],
) -> Result<()> {
    use crate::plot::scale::{coerce_dtypes, TransformKind};

    let aesthetic_ctx = spec.get_aesthetic_context();

    for scale in &mut spec.scales {
        if let Some(scale_type) = &scale.scale_type {
            let display_aes = aesthetic_ctx.map_internal_to_user(&scale.aesthetic);

            if is_facet_aesthetic(&scale.aesthetic)
                && scale_type.scale_type_kind() == ScaleTypeKind::Continuous
            {
                return Err(GgsqlError::ValidationError(format!(
                    "SCALE {}: facet variables require Discrete or Binned scales, got Continuous. \
                     Use SCALE BINNED {} to bin continuous data.",
                    display_aes, display_aes
                )));
            }

            let all_dtypes = collect_dtypes_for_aesthetic(
                &spec.layers,
                &scale.aesthetic,
                layer_type_info,
                &aesthetic_ctx,
            );
            if !all_dtypes.is_empty() {
                if let Ok(common_dtype) = coerce_dtypes(&all_dtypes) {
                    scale_type.validate_dtype(&common_dtype).map_err(|e| {
                        GgsqlError::ValidationError(format!("Scale '{}': {}", display_aes, e))
                    })?;

                    if scale.transform.is_none() && !scale.explicit_transform {
                        let range_sensitive = matches!(
                            scale_type.scale_type_kind(),
                            ScaleTypeKind::Discrete | ScaleTypeKind::Ordinal
                        );
                        // Prefer a transform implied by the input range; fall
                        // back to the scale type's default.
                        let transform_kind = scale
                            .input_range
                            .as_ref()
                            .filter(|_| range_sensitive)
                            .and_then(|range| infer_transform_from_input_range(range))
                            .unwrap_or_else(|| {
                                scale_type.default_transform(&scale.aesthetic, Some(&common_dtype))
                            });
                        scale.transform = Some(Transform::from_kind(transform_kind));
                    }
                }
            }
            continue;
        }

        let all_dtypes = collect_dtypes_for_aesthetic(
            &spec.layers,
            &scale.aesthetic,
            layer_type_info,
            &aesthetic_ctx,
        );
        if all_dtypes.is_empty() {
            continue;
        }

        // Report the user-facing aesthetic name, consistent with the other
        // errors raised for scales.
        let common_dtype = coerce_dtypes(&all_dtypes).map_err(|e| {
            GgsqlError::ValidationError(format!(
                "Scale '{}': {}",
                aesthetic_ctx.map_internal_to_user(&scale.aesthetic),
                e
            ))
        })?;

        // Only a transform the user asked for may steer the scale type.
        let explicit_kind = if scale.explicit_transform {
            scale.transform.as_ref().map(|t| t.transform_kind())
        } else {
            None
        };
        let inferred_scale_type = match explicit_kind {
            Some(
                TransformKind::Date
                | TransformKind::DateTime
                | TransformKind::Time
                | TransformKind::Log10
                | TransformKind::Log2
                | TransformKind::Log
                | TransformKind::Sqrt
                | TransformKind::Square
                | TransformKind::Exp10
                | TransformKind::Exp2
                | TransformKind::Exp
                | TransformKind::Asinh
                | TransformKind::PseudoLog
                | TransformKind::Integer,
            ) => ScaleType::continuous(),
            Some(TransformKind::String | TransformKind::Bool) => ScaleType::discrete(),
            Some(TransformKind::Identity) | None => {
                ScaleType::infer_for_aesthetic(&common_dtype, &scale.aesthetic)
            }
        };

        if scale.transform.is_none() && !scale.explicit_transform {
            let range_sensitive =
                inferred_scale_type.scale_type_kind() == ScaleTypeKind::Discrete;
            let transform_kind = scale
                .input_range
                .as_ref()
                .filter(|_| range_sensitive)
                .and_then(|range| infer_transform_from_input_range(range))
                .unwrap_or_else(|| {
                    inferred_scale_type.default_transform(&scale.aesthetic, Some(&common_dtype))
                });
            scale.transform = Some(Transform::from_kind(transform_kind));
        }

        scale.scale_type = Some(inferred_scale_type);
    }

    Ok(())
}
/// Collects the data types of all columns mapped to `aesthetic` across layers.
///
/// The aesthetic is expanded to its internal position family when the context
/// knows one; otherwise only the aesthetic itself is checked. Layers are paired
/// positionally with `layer_type_info`; layers beyond the end of
/// `layer_type_info` contribute nothing. A mapping contributes a type only if
/// it has a column name (`column_name()`) that appears in that layer's type
/// info.
///
/// Types are returned in layer order and may contain duplicates.
pub fn collect_dtypes_for_aesthetic(
    layers: &[Layer],
    aesthetic: &str,
    layer_type_info: &[Vec<TypeInfo>],
    aesthetic_ctx: &AestheticContext,
) -> Vec<arrow::datatypes::DataType> {
    let family = aesthetic_ctx
        .internal_position_family(aesthetic)
        .map_or_else(|| vec![aesthetic.to_string()], |members| members.to_vec());

    let mut dtypes = Vec::new();
    // `zip` stops at the shorter slice, which skips layers lacking type info.
    for (layer, type_info) in layers.iter().zip(layer_type_info) {
        for aes_name in &family {
            let Some(col_name) = layer
                .mappings
                .get(aes_name)
                .and_then(|value| value.column_name())
            else {
                continue;
            };
            if let Some((_, dtype, _)) = type_info.iter().find(|(n, _, _)| n == col_name) {
                dtypes.push(dtype.clone());
            }
        }
    }
    dtypes
}
/// Resolves binned scales from schema information before stats are applied.
///
/// Only scales whose type kind is `Binned` are considered. A scale is skipped
/// when its aesthetic is reported by `stat_aggregate::targeted_aesthetics` for
/// any layer, or when no schema column can be found for it. The remaining
/// scales are resolved against a `ScaleDataContext` built from the matching
/// schema columns.
///
/// # Errors
///
/// Wraps a resolution failure in `GgsqlError::ValidationError`, prefixed with
/// the aesthetic name produced by `map_internal_to_user`.
pub fn apply_pre_stat_resolve(spec: &mut Plot, layer_schemas: &[Schema]) -> Result<()> {
    use crate::plot::layer::geom::stat_aggregate::targeted_aesthetics;
    use crate::plot::scale::ScaleDataContext;

    let aesthetic_ctx = spec.get_aesthetic_context();

    let mut targeted_in_any_layer: HashSet<String> = HashSet::new();
    for layer in &spec.layers {
        targeted_in_any_layer.extend(targeted_aesthetics(
            &layer.parameters,
            &layer.mappings,
            &aesthetic_ctx,
        ));
    }

    for scale in spec.scales.iter_mut() {
        let Some(scale_type) = scale
            .scale_type
            .as_ref()
            .filter(|st| st.scale_type_kind() == ScaleTypeKind::Binned)
            .cloned()
        else {
            continue;
        };
        if targeted_in_any_layer.contains(&scale.aesthetic) {
            continue;
        }

        let column_infos = find_schema_columns_for_aesthetic(
            &spec.layers,
            &scale.aesthetic,
            layer_schemas,
            &aesthetic_ctx,
        );
        if column_infos.is_empty() {
            continue;
        }

        let context = ScaleDataContext::from_schemas(&column_infos);
        let display_aes = aesthetic_ctx.map_internal_to_user(&scale.aesthetic);
        // Own the name so the scale can be handed over mutably.
        let aesthetic_name = scale.aesthetic.clone();
        scale_type
            .resolve(scale, &context, &aesthetic_name)
            .map_err(|e| GgsqlError::ValidationError(format!("Scale '{}': {}", display_aes, e)))?;
    }

    Ok(())
}
/// Gathers schema-level column information for everything mapped to
/// `aesthetic` (or a member of its internal position family) across layers.
///
/// Layers are paired positionally with `layer_schemas`; layers beyond the end
/// of `layer_schemas` are ignored. Column and annotation-column mappings are
/// looked up by name in the layer's schema and skipped when absent. Literal
/// mappings are converted with [`column_info_from_literal`] and skipped when
/// that yields `None`.
pub fn find_schema_columns_for_aesthetic(
    layers: &[Layer],
    aesthetic: &str,
    layer_schemas: &[Schema],
    aesthetic_ctx: &AestheticContext,
) -> Vec<ColumnInfo> {
    let family = aesthetic_ctx
        .internal_position_family(aesthetic)
        .map_or_else(|| vec![aesthetic.to_string()], |members| members.to_vec());

    let mut infos = Vec::new();
    for (layer, schema) in layers.iter().zip(layer_schemas) {
        for aes_name in &family {
            let Some(value) = layer.mappings.get(aes_name) else {
                continue;
            };
            let found = match value {
                AestheticValue::Column { name, .. }
                | AestheticValue::AnnotationColumn { name } => {
                    schema.iter().find(|c| c.name == *name).cloned()
                }
                AestheticValue::Literal(lit) => column_info_from_literal(aes_name, lit),
            };
            infos.extend(found);
        }
    }
    infos
}
/// Builds a synthetic `ColumnInfo` describing a literal aesthetic value.
///
/// The column is named with `naming::const_column(aesthetic)`, and its `min`
/// and `max` are both the literal itself.
///
/// * Numbers produce a non-discrete `Float64` column.
/// * Strings produce a discrete `Utf8` column.
/// * Booleans produce `None`.
///
/// # Panics
///
/// Panics (via `unreachable!`) for `ParameterValue::Array` and
/// `ParameterValue::Null`.
pub fn column_info_from_literal(aesthetic: &str, lit: &ParameterValue) -> Option<ColumnInfo> {
    use arrow::datatypes::DataType;

    let (dtype, is_discrete, bound) = match lit {
        ParameterValue::Number(n) => (DataType::Float64, false, ArrayElement::Number(*n)),
        ParameterValue::String(s) => (DataType::Utf8, true, ArrayElement::String(s.clone())),
        ParameterValue::Boolean(_) => return None,
        ParameterValue::Array(_) | ParameterValue::Null => {
            unreachable!("Grammar prevents arrays and null in literal aesthetic mappings")
        }
    };

    Some(ColumnInfo {
        name: naming::const_column(aesthetic),
        dtype,
        is_discrete,
        min: Some(bound.clone()),
        max: Some(bound),
    })
}
/// Returns a copy of `df` in which `column_name` is converted to the Arrow
/// type that corresponds to `target_type`.
///
/// If the column already has a matching type, the frame is cloned unchanged:
/// `Boolean` for booleans; `Float64`, `Int64`, `Int32` or `Float32` for
/// numbers; `Date32` for dates; any `Timestamp` for datetimes; any `Time64`
/// for times; `Utf8` for strings.
///
/// Otherwise:
/// * **Boolean**: `Utf8` values are matched case-insensitively against
///   `true`/`yes`/`1` and `false`/`no`/`0`; numeric columns are true when
///   non-zero.
/// * **Number**: cast to `Float64`.
/// * **Date / DateTime / Time**: only `Utf8` sources are accepted. Each value
///   is parsed with the corresponding `ArrayElement::from_*_string` function
///   and the result is cast to `Date32`, `Timestamp(Microsecond, None)` or
///   `Time64(Nanosecond)` respectively.
/// * **String**: cast to `Utf8`.
///
/// Nulls stay null in every conversion written out here.
///
/// # Errors
///
/// Returns `GgsqlError::ValidationError` when the source type is unsupported
/// for the target, when a string value cannot be parsed (the message names the
/// column, the row index and the offending text), or when the column cannot be
/// replaced. Errors from looking up or casting the column are propagated.
pub fn coerce_column_to_type(
    df: &DataFrame,
    column_name: &str,
    target_type: ArrayElementType,
) -> Result<DataFrame> {
    use crate::array_util::*;
    use arrow::array::Array;
    use arrow::datatypes::{DataType, TimeUnit};

    let source = df.column(column_name)?;
    let source_type = source.data_type();

    let already_target_type = match target_type {
        ArrayElementType::Boolean => matches!(source_type, DataType::Boolean),
        ArrayElementType::Number => matches!(
            source_type,
            DataType::Float64 | DataType::Int64 | DataType::Int32 | DataType::Float32
        ),
        ArrayElementType::Date => matches!(source_type, DataType::Date32),
        ArrayElementType::DateTime => matches!(source_type, DataType::Timestamp(_, _)),
        ArrayElementType::Time => matches!(source_type, DataType::Time64(_)),
        ArrayElementType::String => matches!(source_type, DataType::Utf8),
    };
    if already_target_type {
        return Ok(df.clone());
    }

    // Shared error for source types that have no conversion to the target.
    let unsupported = |target_label: &str| {
        GgsqlError::ValidationError(format!(
            "Cannot coerce column '{}' of type {:?} to {}",
            column_name, source_type, target_label
        ))
    };

    let new_array: arrow::array::ArrayRef = match target_type {
        ArrayElementType::Boolean => match source_type {
            DataType::Utf8 => {
                let strings = as_str(source)?;
                let mut flags: Vec<Option<bool>> = Vec::with_capacity(strings.len());
                for row in 0..strings.len() {
                    if strings.is_null(row) {
                        flags.push(None);
                        continue;
                    }
                    let lowered = strings.value(row).to_lowercase();
                    let flag = match lowered.as_str() {
                        "true" | "yes" | "1" => true,
                        "false" | "no" | "0" => false,
                        other => {
                            return Err(GgsqlError::ValidationError(format!(
                                "Column '{}' row {}: Cannot coerce string '{}' to boolean",
                                column_name, row, other
                            )));
                        }
                    };
                    flags.push(Some(flag));
                }
                new_bool_array(flags)
            }
            DataType::Int64 | DataType::Int32 | DataType::Float64 | DataType::Float32 => {
                let widened = cast_array(source, &DataType::Float64)?;
                let floats = as_f64(&widened)?;
                let flags: Vec<Option<bool>> = (0..floats.len())
                    .map(|row| (!floats.is_null(row)).then(|| floats.value(row) != 0.0))
                    .collect();
                new_bool_array(flags)
            }
            _ => return Err(unsupported("boolean")),
        },
        ArrayElementType::Number => cast_array(source, &DataType::Float64).map_err(|e| {
            GgsqlError::ValidationError(format!(
                "Cannot coerce column '{}' to number: {}",
                column_name, e
            ))
        })?,
        ArrayElementType::Date => match source_type {
            DataType::Utf8 => {
                let strings = as_str(source)?;
                let mut days: Vec<Option<i32>> = Vec::with_capacity(strings.len());
                for row in 0..strings.len() {
                    if strings.is_null(row) {
                        days.push(None);
                        continue;
                    }
                    let text = strings.value(row);
                    match ArrayElement::from_date_string(text) {
                        Some(ArrayElement::Date(d)) => days.push(Some(d)),
                        _ => {
                            return Err(GgsqlError::ValidationError(format!(
                                "Column '{}' row {}: Cannot coerce string '{}' to date (expected YYYY-MM-DD)",
                                column_name, row, text
                            )));
                        }
                    }
                }
                let i32_arr = new_i32_array(days);
                cast_array(&i32_arr, &DataType::Date32)?
            }
            _ => return Err(unsupported("date")),
        },
        ArrayElementType::DateTime => match source_type {
            DataType::Utf8 => {
                let strings = as_str(source)?;
                let mut stamps: Vec<Option<i64>> = Vec::with_capacity(strings.len());
                for row in 0..strings.len() {
                    if strings.is_null(row) {
                        stamps.push(None);
                        continue;
                    }
                    let text = strings.value(row);
                    match ArrayElement::from_datetime_string(text) {
                        Some(ArrayElement::DateTime(dt)) => stamps.push(Some(dt)),
                        _ => {
                            return Err(GgsqlError::ValidationError(format!(
                                "Column '{}' row {}: Cannot coerce string '{}' to datetime",
                                column_name, row, text
                            )));
                        }
                    }
                }
                let i64_arr = new_i64_array(stamps);
                cast_array(&i64_arr, &DataType::Timestamp(TimeUnit::Microsecond, None))?
            }
            _ => return Err(unsupported("datetime")),
        },
        ArrayElementType::Time => match source_type {
            DataType::Utf8 => {
                let strings = as_str(source)?;
                let mut times: Vec<Option<i64>> = Vec::with_capacity(strings.len());
                for row in 0..strings.len() {
                    if strings.is_null(row) {
                        times.push(None);
                        continue;
                    }
                    let text = strings.value(row);
                    match ArrayElement::from_time_string(text) {
                        Some(ArrayElement::Time(t)) => times.push(Some(t)),
                        _ => {
                            return Err(GgsqlError::ValidationError(format!(
                                "Column '{}' row {}: Cannot coerce string '{}' to time (expected HH:MM:SS)",
                                column_name, row, text
                            )));
                        }
                    }
                }
                let i64_arr = new_i64_array(times);
                cast_array(&i64_arr, &DataType::Time64(TimeUnit::Nanosecond))?
            }
            _ => return Err(unsupported("time")),
        },
        ArrayElementType::String => cast_array(source, &DataType::Utf8).map_err(|e| {
            GgsqlError::ValidationError(format!(
                "Cannot coerce column '{}' to string: {}",
                column_name, e
            ))
        })?,
    };

    df.with_column(column_name, new_array)
        .map_err(|e| GgsqlError::ValidationError(format!("Failed to replace column: {}", e)))
}
/// Coerces every data column mapped to `aesthetic` (or to a member of its
/// internal position family) to `target_type`, in place within `data_map`.
///
/// Only `AestheticValue::Column` mappings are considered. A column is skipped
/// when its layer has no frame in `data_map` or when the frame has no column of
/// that name. Each `(layer key, column name)` pair is coerced at most once,
/// even if several aesthetics of the family map to the same column.
///
/// # Errors
///
/// Propagates the first error from [`coerce_column_to_type`].
pub fn coerce_aesthetic_columns(
    layers: &[Layer],
    data_map: &mut HashMap<String, DataFrame>,
    aesthetic: &str,
    target_type: ArrayElementType,
    aesthetic_ctx: &AestheticContext,
) -> Result<()> {
    let family = aesthetic_ctx
        .internal_position_family(aesthetic)
        .map_or_else(|| vec![aesthetic.to_string()], |members| members.to_vec());

    let mut done: HashSet<(String, String)> = HashSet::new();
    for (i, layer) in layers.iter().enumerate() {
        let layer_key = naming::layer_key(i);
        for aes_name in &family {
            let Some(AestheticValue::Column { name, .. }) = layer.mappings.get(aes_name) else {
                continue;
            };
            let Some(df) = data_map.get(&layer_key) else {
                continue;
            };
            let pair = (layer_key.clone(), name.clone());
            if done.contains(&pair) || df.column(name).is_err() {
                continue;
            }
            let coerced_df = coerce_column_to_type(df, name, target_type)?;
            data_map.insert(layer_key.clone(), coerced_df);
            done.insert(pair);
        }
    }

    Ok(())
}
/// Resolves every not-yet-resolved scale of `spec` against the layer data.
///
/// For each scale whose `resolved` flag is false:
/// 1. If `infer_scale_target_type` yields a target type, the mapped columns
///    are coerced to it in `data_map`.
/// 2. The mapped columns are looked up; scales without any column are skipped.
/// 3. A missing scale type is inferred from the first column's data type.
/// 4. The scale type's `resolve` is run with a `ScaleDataContext` built from
///    the columns.
///
/// When the plot uses a polar projection with no numeric `end` property, or
/// with `end` equal to `start` (`start` defaults to `0.0`), the `pos2` scale is
/// resolved with `default_expand` set to `(0.0, 0.0)`.
///
/// # Errors
///
/// Propagates coercion errors, and wraps resolution failures in
/// `GgsqlError::ValidationError` prefixed with the aesthetic name produced by
/// `map_internal_to_user`.
pub fn resolve_scales(spec: &mut Plot, data_map: &mut HashMap<String, DataFrame>) -> Result<()> {
    use crate::plot::projection::CoordKind;
    use crate::plot::scale::ScaleDataContext;

    let aesthetic_ctx = spec.get_aesthetic_context();

    // True only for a polar projection that spans the full circle.
    let polar_full_circle = spec.project.as_ref().is_some_and(|p| {
        if p.coord.coord_kind() != CoordKind::Polar {
            return false;
        }
        let number_property = |key: &str| match p.properties.get(key) {
            Some(ParameterValue::Number(n)) => Some(*n),
            _ => None,
        };
        let start = number_property("start").unwrap_or(0.0);
        match number_property("end") {
            None => true,
            Some(end) => end == start,
        }
    });

    for idx in 0..spec.scales.len() {
        if spec.scales[idx].resolved {
            continue;
        }
        let aesthetic = spec.scales[idx].aesthetic.clone();

        if let Some(target_type) = infer_scale_target_type(&spec.scales[idx]) {
            coerce_aesthetic_columns(
                &spec.layers,
                data_map,
                &aesthetic,
                target_type,
                &aesthetic_ctx,
            )?;
        }

        let column_refs =
            find_columns_for_aesthetic(&spec.layers, &aesthetic, data_map, &aesthetic_ctx);
        let Some(first_column) = column_refs.first() else {
            continue;
        };

        // Fill in a missing scale type from the data, then work on a copy so
        // the scale itself can be borrowed mutably by `resolve`.
        let scale_type = spec.scales[idx]
            .scale_type
            .get_or_insert_with(|| {
                ScaleType::infer_for_aesthetic(first_column.data_type(), &aesthetic)
            })
            .clone();

        let mut context =
            ScaleDataContext::from_columns(&column_refs, scale_type.uses_discrete_input_range());
        if polar_full_circle && aesthetic == "pos2" {
            context.default_expand = Some((0.0, 0.0));
        }

        let display_aes = aesthetic_ctx.map_internal_to_user(&aesthetic);
        scale_type
            .resolve(&mut spec.scales[idx], &context, &aesthetic)
            .map_err(|e| GgsqlError::ValidationError(format!("Scale '{}': {}", display_aes, e)))?;
    }

    Ok(())
}
/// Returns references to all data columns mapped to `aesthetic` (or to a
/// member of its internal position family) across layers.
///
/// Layer `i` is looked up in `data_map` under `naming::layer_key(i)`; layers
/// without a frame are skipped. Only `AestheticValue::Column` mappings are
/// followed, and only when the named column exists in the frame. Results are
/// in layer order, then family order, and may repeat a column.
pub fn find_columns_for_aesthetic<'a>(
    layers: &[Layer],
    aesthetic: &str,
    data_map: &'a HashMap<String, DataFrame>,
    aesthetic_ctx: &AestheticContext,
) -> Vec<&'a ArrayRef> {
    let family = aesthetic_ctx
        .internal_position_family(aesthetic)
        .map_or_else(|| vec![aesthetic.to_string()], |members| members.to_vec());

    let mut found = Vec::new();
    for (i, layer) in layers.iter().enumerate() {
        let Some(df) = data_map.get(&naming::layer_key(i)) else {
            continue;
        };
        for aes_name in &family {
            let Some(AestheticValue::Column { name, .. }) = layer.mappings.get(aes_name) else {
                continue;
            };
            if let Ok(column) = df.column(name) {
                found.push(column);
            }
        }
    }
    found
}
/// Applies out-of-bounds handling and null filtering to the layer data.
///
/// **Pass 1 - out-of-bounds.** For each scale an OOB mode is chosen:
/// * the `oob` string property, unless it equals `OOB_KEEP`;
/// * otherwise, when the property is absent and the input range was given
///   explicitly, `default_oob` for the aesthetic, unless that is `OOB_KEEP`.
///
/// Scales with no mode, or with a missing or empty input range, are skipped.
/// When the first two range elements are both numeric-like (number, date,
/// datetime or time) the columns are handled by
/// [`apply_oob_to_column_numeric`]; otherwise the non-null range elements form
/// the allowed set passed to [`apply_oob_to_column_discrete`].
///
/// **Pass 2 - null filtering.** For each scale with an explicit input range
/// that contains no null element, rows with a null in a mapped column are
/// removed from a layer's frame, but only for aesthetics that the layer's geom
/// reports as required.
///
/// # Errors
///
/// Propagates errors from the OOB helpers and from [`filter_null_rows`].
pub fn apply_scale_oob(spec: &Plot, data_map: &mut HashMap<String, DataFrame>) -> Result<()> {
    /// Numeric view of number/date/datetime/time elements; `None` otherwise.
    fn numeric_value(elem: &ArrayElement) -> Option<f64> {
        match elem {
            ArrayElement::Number(n) => Some(*n),
            ArrayElement::Date(d) => Some(*d as f64),
            ArrayElement::DateTime(dt) => Some(*dt as f64),
            ArrayElement::Time(t) => Some(*t as f64),
            _ => None,
        }
    }

    let aesthetic_ctx = spec.get_aesthetic_context();

    for scale in &spec.scales {
        let oob_mode = match scale.properties.get("oob") {
            Some(ParameterValue::String(mode)) => {
                if mode == OOB_KEEP {
                    continue;
                }
                mode.as_str()
            }
            None if scale.explicit_input_range => {
                let fallback = default_oob(&scale.aesthetic);
                if fallback == OOB_KEEP {
                    continue;
                }
                fallback
            }
            _ => continue,
        };

        let Some(input_range) = scale.input_range.as_ref().filter(|r| !r.is_empty()) else {
            continue;
        };

        let column_sources = find_columns_for_aesthetic_with_sources(
            &spec.layers,
            &scale.aesthetic,
            data_map,
            &aesthetic_ctx,
        );

        // `Some` exactly when the first two range elements are numeric-like.
        let numeric_bounds =
            numeric_value(&input_range[0]).zip(input_range.get(1).and_then(numeric_value));

        for (data_key, col_name) in column_sources {
            let Some(df) = data_map.get(&data_key) else {
                continue;
            };
            if df.column(&col_name).is_err() {
                continue;
            }
            let transformed = match numeric_bounds {
                Some((range_min, range_max)) => {
                    apply_oob_to_column_numeric(df, &col_name, range_min, range_max, oob_mode)?
                }
                None => {
                    let allowed_values: HashSet<String> = input_range
                        .iter()
                        .filter(|elem| !matches!(elem, ArrayElement::Null))
                        .map(|elem| elem.to_key_string())
                        .collect();
                    apply_oob_to_column_discrete(df, &col_name, &allowed_values, oob_mode)?
                }
            };
            data_map.insert(data_key, transformed);
        }
    }

    for scale in &spec.scales {
        let range_excludes_null = scale
            .input_range
            .as_ref()
            .is_some_and(|range| !range.iter().any(|elem| matches!(elem, ArrayElement::Null)));
        if !(scale.explicit_input_range && range_excludes_null) {
            continue;
        }

        let family = aesthetic_ctx
            .internal_position_family(&scale.aesthetic)
            .map_or_else(|| vec![scale.aesthetic.clone()], |members| members.to_vec());

        for (i, layer) in spec.layers.iter().enumerate() {
            let layer_key = naming::layer_key(i);
            if !data_map.contains_key(&layer_key) {
                continue;
            }
            let geom_aesthetics = layer.geom.aesthetics();
            for aes_name in &family {
                if !geom_aesthetics.is_required(aes_name) {
                    continue;
                }
                let Some(AestheticValue::Column { name, .. }) =
                    layer.mappings.get(aes_name.as_str())
                else {
                    continue;
                };
                let Some(df) = data_map.get(&layer_key) else {
                    continue;
                };
                if df.column(name).is_ok() {
                    let filtered = filter_null_rows(df, name)?;
                    data_map.insert(layer_key.clone(), filtered);
                }
            }
        }
    }

    Ok(())
}
/// Lists `(layer key, column name)` pairs for every column mapped to
/// `aesthetic` (or to a member of its internal position family).
///
/// Only layers that have a frame in `data_map` and only
/// `AestheticValue::Column` mappings are reported. Whether the named column
/// actually exists in the frame is not checked here.
pub fn find_columns_for_aesthetic_with_sources(
    layers: &[Layer],
    aesthetic: &str,
    data_map: &HashMap<String, DataFrame>,
    aesthetic_ctx: &AestheticContext,
) -> Vec<(String, String)> {
    let family = aesthetic_ctx
        .internal_position_family(aesthetic)
        .map_or_else(|| vec![aesthetic.to_string()], |members| members.to_vec());

    let mut sources = Vec::new();
    for (i, layer) in layers.iter().enumerate() {
        let layer_key = naming::layer_key(i);
        if !data_map.contains_key(&layer_key) {
            continue;
        }
        sources.extend(
            family
                .iter()
                .filter_map(|aes_name| match layer.mappings.get(aes_name) {
                    Some(AestheticValue::Column { name, .. }) => {
                        Some((layer_key.clone(), name.clone()))
                    }
                    _ => None,
                }),
        );
    }
    sources
}
/// Applies an out-of-bounds policy to a numeric-like column.
///
/// The column is cast to `Float64` for comparison. The two range bounds are
/// put in ascending order first, so a reversed range (`range_min > range_max`)
/// describes the same interval instead of emptying the data or panicking.
///
/// * `OOB_CENSOR`: removes rows whose value lies outside the range. Rows with
///   a null in this column are kept; `NaN` values compare as out of range and
///   are removed.
/// * `OOB_SQUISH`: clamps values into the range. Nulls stay null and `NaN`
///   values are left as they are. The clamped column is cast back to the
///   original type when that was `Date32`, `Timestamp` or `Time64`; otherwise
///   it stays `Float64`.
/// * any other mode: returns a clone of `df`.
///
/// # Errors
///
/// Returns `GgsqlError::ValidationError` if the column cannot be cast to
/// `Float64`, and `GgsqlError::InternalError` if filtering, restoring the
/// temporal type, or replacing the column fails. Column lookup errors are
/// propagated.
pub fn apply_oob_to_column_numeric(
    df: &DataFrame,
    col_name: &str,
    range_min: f64,
    range_max: f64,
    oob_mode: &str,
) -> Result<DataFrame> {
    use crate::array_util::*;
    use arrow::array::Array;
    use arrow::datatypes::DataType;

    // `f64::clamp` panics when min > max, so normalize the bound order.
    let (lower, upper) = if range_max < range_min {
        (range_max, range_min)
    } else {
        (range_min, range_max)
    };

    let col = df.column(col_name)?;
    let f64_col = cast_array(col, &DataType::Float64).map_err(|_| {
        GgsqlError::ValidationError(format!(
            "Cannot apply oob to non-numeric column '{}'",
            col_name
        ))
    })?;
    let f64_arr = as_f64(&f64_col)?;

    match oob_mode {
        OOB_CENSOR => {
            let mask_values: Vec<bool> = (0..f64_arr.len())
                .map(|i| {
                    if f64_arr.is_null(i) {
                        // Nulls are not out of bounds; keep the row.
                        true
                    } else {
                        let v = f64_arr.value(i);
                        v >= lower && v <= upper
                    }
                })
                .collect();
            let mask = arrow::array::BooleanArray::from(mask_values);

            // Filter every column with the same mask so rows stay aligned.
            let mut new_columns = Vec::new();
            let schema = df.schema();
            for (i, field) in schema.fields().iter().enumerate() {
                let col_arr = df.get_columns()[i].clone();
                let filtered = arrow::compute::filter(&col_arr, &mask)
                    .map_err(|e| GgsqlError::InternalError(format!("Failed to filter: {}", e)))?;
                new_columns.push((field.name().as_str(), filtered));
            }
            DataFrame::new(new_columns)
        }
        OOB_SQUISH => {
            let clamped: Vec<Option<f64>> = (0..f64_arr.len())
                .map(|i| {
                    if f64_arr.is_null(i) {
                        return None;
                    }
                    let v = f64_arr.value(i);
                    // Comparison-based clamp: never panics, and leaves NaN
                    // values untouched because both comparisons are false.
                    Some(if v < lower {
                        lower
                    } else if v > upper {
                        upper
                    } else {
                        v
                    })
                })
                .collect();
            let clamped_array = new_f64_array(clamped);

            let original_dtype = col.data_type().clone();
            let restored_array = match &original_dtype {
                DataType::Date32 | DataType::Timestamp(_, _) | DataType::Time64(_) => {
                    cast_array(&clamped_array, &original_dtype).map_err(|e| {
                        GgsqlError::InternalError(format!(
                            "Failed to restore temporal type for '{}': {}",
                            col_name, e
                        ))
                    })?
                }
                _ => clamped_array,
            };
            df.with_column(col_name, restored_array)
                .map_err(|e| GgsqlError::InternalError(format!("Failed to replace column: {}", e)))
        }
        _ => Ok(df.clone()),
    }
}
/// Returns a new frame without the rows in which `col_name` is null.
///
/// A keep-mask is built from the nullness of `col_name` and applied to every
/// column of `df`, so all columns lose the same rows.
///
/// # Errors
///
/// Propagates the column lookup error, returns `GgsqlError::InternalError` if
/// filtering a column fails, and propagates any error from `DataFrame::new`.
pub fn filter_null_rows(df: &DataFrame, col_name: &str) -> Result<DataFrame> {
    use arrow::array::Array;

    let key_column = df.column(col_name)?;
    let keep: Vec<bool> = (0..key_column.len())
        .map(|row| key_column.is_valid(row))
        .collect();
    let mask = arrow::array::BooleanArray::from(keep);

    let schema = df.schema();
    let mut new_columns = Vec::new();
    for (field, column) in schema.fields().iter().zip(df.get_columns().iter()) {
        let filtered = arrow::compute::filter(column, &mask)
            .map_err(|e| GgsqlError::InternalError(format!("Failed to filter NULL rows: {}", e)))?;
        new_columns.push((field.name().as_str(), filtered));
    }
    DataFrame::new(new_columns)
}
/// Applies an out-of-bounds policy to a discrete column.
///
/// Only `OOB_CENSOR` has an effect: every value whose string form (via
/// `value_to_string`) is not in `allowed_values` is replaced by null. Existing
/// nulls stay null, and the column is rebuilt as a string array. No rows are
/// removed. For any other mode a clone of `df` is returned without looking at
/// the column.
///
/// # Errors
///
/// Propagates the column lookup error and returns `GgsqlError::InternalError`
/// if the column cannot be replaced.
pub fn apply_oob_to_column_discrete(
    df: &DataFrame,
    col_name: &str,
    allowed_values: &HashSet<String>,
    oob_mode: &str,
) -> Result<DataFrame> {
    use crate::array_util::*;
    use arrow::array::Array;

    if oob_mode != OOB_CENSOR {
        return Ok(df.clone());
    }

    let col = df.column(col_name)?;
    let censored: Vec<Option<String>> = (0..col.len())
        .map(|row| {
            if col.is_null(row) {
                return None;
            }
            // Values outside the allowed set are nulled out.
            Some(value_to_string(col, row)).filter(|text| allowed_values.contains(text))
        })
        .collect();

    let borrowed: Vec<Option<&str>> = censored.iter().map(|value| value.as_deref()).collect();
    let new_array = new_str_array(borrowed);
    df.with_column(col_name, new_array)
        .map_err(|e| GgsqlError::InternalError(format!("Failed to replace column: {}", e)))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::plot::{ArrayElement, Parameters};
use crate::Geom;
use arrow::datatypes::DataType;
#[test]
fn test_aesthetic_context_internal_family() {
let ctx = AestheticContext::from_static(&["x", "y"], &[]);
let pos1_family = ctx.internal_position_family("pos1").unwrap();
assert!(pos1_family.iter().any(|s| s == "pos1"));
assert!(pos1_family.iter().any(|s| s == "pos1min"));
assert!(pos1_family.iter().any(|s| s == "pos1max"));
assert!(pos1_family.iter().any(|s| s == "pos1end"));
assert_eq!(pos1_family.len(), 4);
let pos2_family = ctx.internal_position_family("pos2").unwrap();
assert!(pos2_family.iter().any(|s| s == "pos2"));
assert!(pos2_family.iter().any(|s| s == "pos2min"));
assert!(pos2_family.iter().any(|s| s == "pos2max"));
assert!(pos2_family.iter().any(|s| s == "pos2end"));
assert_eq!(pos2_family.len(), 4);
assert!(ctx.internal_position_family("color").is_none());
assert!(ctx.internal_position_family("pos1min").is_none());
}
#[test]
fn test_scale_type_infer() {
assert_eq!(ScaleType::infer(&DataType::Int32), ScaleType::continuous());
assert_eq!(ScaleType::infer(&DataType::Int64), ScaleType::continuous());
assert_eq!(
ScaleType::infer(&DataType::Float64),
ScaleType::continuous()
);
assert_eq!(ScaleType::infer(&DataType::UInt16), ScaleType::continuous());
assert_eq!(ScaleType::infer(&DataType::Date32), ScaleType::continuous());
assert_eq!(
ScaleType::infer(&DataType::Timestamp(
arrow::datatypes::TimeUnit::Microsecond,
None
)),
ScaleType::continuous()
);
assert_eq!(
ScaleType::infer(&DataType::Time64(arrow::datatypes::TimeUnit::Nanosecond)),
ScaleType::continuous()
);
assert_eq!(ScaleType::infer(&DataType::Utf8), ScaleType::discrete());
assert_eq!(ScaleType::infer(&DataType::Boolean), ScaleType::discrete());
}
#[test]
fn test_resolve_scales_infers_input_range() {
use crate::df;
let mut spec = Plot::new();
let mut scale = crate::plot::Scale::new("pos1");
scale.properties.insert(
"expand".to_string(),
crate::plot::ParameterValue::Number(0.0),
);
spec.scales.push(scale);
let layer = Layer::new(Geom::point())
.with_aesthetic("pos1".to_string(), AestheticValue::standard_column("value"));
spec.layers.push(layer);
let df = df! {
"value" => vec![1.0f64, 5.0, 10.0]
}
.unwrap();
let mut data_map = HashMap::new();
data_map.insert(naming::layer_key(0), df);
resolve_scales(&mut spec, &mut data_map).unwrap();
let scale = &spec.scales[0];
assert_eq!(scale.scale_type, Some(ScaleType::continuous()));
assert!(scale.input_range.is_some());
let range = scale.input_range.as_ref().unwrap();
assert_eq!(range.len(), 2);
match (&range[0], &range[1]) {
(ArrayElement::Number(min), ArrayElement::Number(max)) => {
assert_eq!(*min, 1.0);
assert_eq!(*max, 10.0);
}
_ => panic!("Expected Number elements"),
}
}
#[test]
fn test_resolve_scales_preserves_explicit_input_range() {
use crate::df;
let mut spec = Plot::new();
let mut scale = crate::plot::Scale::new("pos1");
scale.input_range = Some(vec![ArrayElement::Number(0.0), ArrayElement::Number(100.0)]);
scale.properties.insert(
"expand".to_string(),
crate::plot::ParameterValue::Number(0.0),
);
spec.scales.push(scale);
let layer = Layer::new(Geom::point())
.with_aesthetic("pos1".to_string(), AestheticValue::standard_column("value"));
spec.layers.push(layer);
let df = df! {
"value" => vec![1.0f64, 5.0, 10.0]
}
.unwrap();
let mut data_map = HashMap::new();
data_map.insert(naming::layer_key(0), df);
resolve_scales(&mut spec, &mut data_map).unwrap();
let scale = &spec.scales[0];
let range = scale.input_range.as_ref().unwrap();
match (&range[0], &range[1]) {
(ArrayElement::Number(min), ArrayElement::Number(max)) => {
assert_eq!(*min, 0.0); assert_eq!(*max, 100.0); }
_ => panic!("Expected Number elements"),
}
}
/// A bare `pos2` scale is paired with a layer that maps only `pos2min` and
/// `pos2max`. After `resolve_scales` the scale must have an input range whose
/// minimum is at most 5 (smallest `low`) and whose maximum is at least 30
/// (largest `high`).
#[test]
fn test_resolve_scales_from_aesthetic_family_input_range() {
    use crate::df;

    let mut spec = Plot::new();
    spec.scales.push(crate::plot::Scale::new("pos2"));

    let range_layer = Layer::new(Geom::range())
        .with_aesthetic(
            "pos2min".to_string(),
            AestheticValue::standard_column("low"),
        )
        .with_aesthetic(
            "pos2max".to_string(),
            AestheticValue::standard_column("high"),
        );
    spec.layers.push(range_layer);

    let bounds = df! {
        "low" => vec![5.0f64, 10.0, 15.0],
        "high" => vec![20.0f64, 25.0, 30.0]
    }
    .unwrap();
    let mut data_map = HashMap::new();
    data_map.insert(naming::layer_key(0), bounds);

    resolve_scales(&mut spec, &mut data_map).unwrap();

    let resolved = &spec.scales[0];
    assert!(resolved.input_range.is_some());
    let range = resolved.input_range.as_ref().unwrap();

    // Pull both endpoints out first, then check them against the data extent.
    let (lo, hi) = match (&range[0], &range[1]) {
        (ArrayElement::Number(lo), ArrayElement::Number(hi)) => (lo, hi),
        _ => panic!("Expected Number elements"),
    };
    assert!(*lo <= 5.0, "min should be at most 5.0, got {}", lo);
    assert!(*hi >= 30.0, "max should be at least 30.0, got {}", hi);
}
/// The scale's input range is `[0, Null]`: an explicit minimum with the
/// maximum left open. With data spanning 1..10 and `expand` set to `0.0`,
/// the resolved range must be exactly 0 to 10.
#[test]
fn test_resolve_scales_partial_input_range_explicit_min_null_max() {
    use crate::df;

    let mut half_open = crate::plot::Scale::new("pos1");
    half_open.input_range = Some(vec![ArrayElement::Number(0.0), ArrayElement::Null]);
    half_open.properties.insert(
        "expand".to_string(),
        crate::plot::ParameterValue::Number(0.0),
    );

    let mut spec = Plot::new();
    spec.scales.push(half_open);
    spec.layers.push(
        Layer::new(Geom::point())
            .with_aesthetic("pos1".to_string(), AestheticValue::standard_column("value")),
    );

    let frame = df! {
        "value" => vec![1.0f64, 5.0, 10.0]
    }
    .unwrap();
    let mut data_map = HashMap::new();
    data_map.insert(naming::layer_key(0), frame);

    resolve_scales(&mut spec, &mut data_map).unwrap();

    let resolved = spec.scales[0].input_range.as_ref().unwrap();
    let endpoints = (&resolved[0], &resolved[1]);
    if let (ArrayElement::Number(lo), ArrayElement::Number(hi)) = endpoints {
        // Minimum comes from the scale, maximum from the data.
        assert_eq!(*lo, 0.0);
        assert_eq!(*hi, 10.0);
    } else {
        panic!("Expected Number elements");
    }
}
/// The scale's input range is `[Null, 100]`: the minimum is left open and the
/// maximum is explicit. With data spanning 1..10 and `expand` set to `0.0`,
/// the resolved range must be exactly 1 to 100.
#[test]
fn test_resolve_scales_partial_input_range_null_min_explicit_max() {
    use crate::df;

    let mut half_open = crate::plot::Scale::new("pos1");
    half_open.input_range = Some(vec![ArrayElement::Null, ArrayElement::Number(100.0)]);
    half_open.properties.insert(
        "expand".to_string(),
        crate::plot::ParameterValue::Number(0.0),
    );

    let mut spec = Plot::new();
    spec.scales.push(half_open);
    spec.layers.push(
        Layer::new(Geom::point())
            .with_aesthetic("pos1".to_string(), AestheticValue::standard_column("value")),
    );

    let frame = df! {
        "value" => vec![1.0f64, 5.0, 10.0]
    }
    .unwrap();
    let mut data_map = HashMap::new();
    data_map.insert(naming::layer_key(0), frame);

    resolve_scales(&mut spec, &mut data_map).unwrap();

    let resolved = spec.scales[0].input_range.as_ref().unwrap();
    let (lo, hi) = match (&resolved[0], &resolved[1]) {
        (ArrayElement::Number(lo), ArrayElement::Number(hi)) => (*lo, *hi),
        _ => panic!("Expected Number elements"),
    };
    // Minimum comes from the data, maximum from the scale.
    assert_eq!(lo, 1.0);
    assert_eq!(hi, 100.0);
}
/// Under a polar projection, a `pos2` scale fed values 10..30 must resolve to
/// the input range `[10, 30]` exactly, with no padding added at either end.
#[test]
fn test_resolve_scales_polar_theta_no_expansion() {
    use crate::df;
    use crate::plot::projection::{Coord, Projection};

    let mut spec = Plot::new();

    // Build the polar projection, taking its aesthetic names from the coord.
    let polar = Coord::polar();
    let aesthetics = polar
        .position_aesthetic_names()
        .iter()
        .map(|name| name.to_string())
        .collect();
    spec.project = Some(Projection {
        coord: polar,
        aesthetics,
        properties: Parameters::new(),
        computed: Parameters::new(),
    });

    spec.scales.push(crate::plot::Scale::new("pos2"));
    spec.layers.push(
        Layer::new(Geom::bar())
            .with_aesthetic("pos2".to_string(), AestheticValue::standard_column("value")),
    );

    let frame = df! {
        "value" => vec![10.0f64, 20.0, 30.0]
    }
    .unwrap();
    let mut data_map = HashMap::new();
    data_map.insert(naming::layer_key(0), frame);

    // Sanity-check the fixture before exercising the resolver.
    assert!(spec.project.is_some(), "project should be set");
    assert_eq!(
        spec.project.as_ref().map(|p| p.coord.coord_kind()),
        Some(crate::plot::CoordKind::Polar),
        "coord_kind should be Polar"
    );

    resolve_scales(&mut spec, &mut data_map).unwrap();

    let resolved = &spec.scales[0];
    assert!(resolved.input_range.is_some());
    let range = resolved.input_range.as_ref().unwrap();
    assert_eq!(range.len(), 2);
    if let (ArrayElement::Number(lo), ArrayElement::Number(hi)) = (&range[0], &range[1]) {
        assert_eq!(*lo, 10.0, "min should be 10.0 (no expansion)");
        assert_eq!(*hi, 30.0, "max should be 30.0 (no expansion)");
    } else {
        panic!("Expected Number elements");
    }
}
/// Applies `OOB_CENSOR` to a `Date32` column with bounds 19783..19967.
/// Of the three values (19723, 19875, 20058) only 19875 lies inside the
/// bounds, and the test expects exactly one row in the result.
#[test]
fn test_apply_oob_censor_date32() {
    use arrow::array::{ArrayRef, Date32Array};
    use std::sync::Arc;

    let day_numbers = Date32Array::from(vec![19723, 19875, 20058]);
    let column: ArrayRef = Arc::new(day_numbers);
    let frame = DataFrame::new(vec![("date", column)]).unwrap();

    let censored = apply_oob_to_column_numeric(&frame, "date", 19783.0, 19967.0, OOB_CENSOR)
        .expect("oob censor should handle Date32");

    assert_eq!(censored.height(), 1);
}
/// Applies `OOB_SQUISH` to a `Date32` column with bounds 19723..20089 and
/// checks that the resulting `date` column is still typed `Date32`.
#[test]
fn test_apply_oob_squish_date32_restores_temporal_type() {
    use arrow::array::{ArrayRef, Date32Array};
    use std::sync::Arc;

    // The first and last values fall outside the bounds passed below.
    let day_numbers = Date32Array::from(vec![19000, 19875, 21000]);
    let column: ArrayRef = Arc::new(day_numbers);
    let frame = DataFrame::new(vec![("date", column)]).unwrap();

    let squished = apply_oob_to_column_numeric(&frame, "date", 19723.0, 20089.0, OOB_SQUISH)
        .expect("oob squish should handle Date32");

    let date_column = squished.column("date").unwrap();
    assert_eq!(date_column.data_type(), &DataType::Date32);
}
/// End-to-end checks that scale validation errors name the aesthetic the way
/// the user wrote it (`panel`, `row`, `x`, `angle`) rather than an internal
/// name such as `pos1`. Everything here is compiled only with the `duckdb`
/// feature.
mod scale_error_translation_tests {
    #[cfg(feature = "duckdb")]
    use crate::reader::DuckDBReader;
    #[cfg(feature = "duckdb")]
    use crate::reader::Reader;
    #[cfg(feature = "duckdb")]
    use crate::GgsqlError;

    /// Executes `query` against an in-memory DuckDB reader and returns the
    /// message carried by the resulting `GgsqlError::ValidationError`.
    ///
    /// Panics if the query succeeds or fails with any other error variant.
    #[cfg(feature = "duckdb")]
    #[track_caller]
    fn validation_error_message(query: &str) -> String {
        let reader = DuckDBReader::from_connection_string("duckdb://memory").unwrap();
        match reader.execute(query) {
            Err(GgsqlError::ValidationError(s)) => s,
            Err(other) => panic!("expected ValidationError, got: {}", other),
            Ok(_) => panic!("expected error, got success"),
        }
    }

    /// A continuous scale on a wrapped facet must be rejected with a message
    /// that refers to `panel`.
    #[cfg(feature = "duckdb")]
    #[test]
    fn facet_continuous_scale_uses_panel_name_not_facet1() {
        let msg = validation_error_message(
            r#"
SELECT 1 AS x, 2 AS y, 'a' AS region
VISUALISE x, y
DRAW point
FACET region
SCALE CONTINUOUS panel
"#,
        );
        assert_eq!(
            msg,
            "SCALE panel: facet variables require Discrete or Binned scales, got Continuous. \
             Use SCALE BINNED panel to bin continuous data."
        );
    }

    /// In a grid facet (`FACET region BY category`) a continuous scale on
    /// `row` must be rejected with a message that refers to `row`.
    #[cfg(feature = "duckdb")]
    #[test]
    fn facet_continuous_scale_uses_row_name_not_facet1_in_grid() {
        let msg = validation_error_message(
            r#"
SELECT 1 AS x, 2 AS y, 'a' AS region, 'b' AS category
VISUALISE x, y
DRAW point
FACET region BY category
SCALE CONTINUOUS row
"#,
        );
        assert_eq!(
            msg,
            "SCALE row: facet variables require Discrete or Binned scales, got Continuous. \
             Use SCALE BINNED row to bin continuous data."
        );
    }

    /// A continuous scale on a string-valued `x` must fail with a message that
    /// starts with `Scale 'x':` and mentions neither `pos1` nor the
    /// `__ggsql_aes_` column prefix.
    #[cfg(feature = "duckdb")]
    #[test]
    fn explicit_scale_type_dtype_error_uses_x_name_not_pos1() {
        let msg = validation_error_message(
            r#"
SELECT 'a' AS x, 1 AS y
VISUALISE x, y
DRAW point
SCALE CONTINUOUS x
"#,
        );
        let has_user_facing_prefix = msg.starts_with("Scale 'x':");
        assert!(
            has_user_facing_prefix,
            "expected message to start with \"Scale 'x':\", got: {}",
            msg
        );
        let leaks_internal_name = msg.contains("pos1");
        assert!(
            !leaks_internal_name,
            "message must not mention internal name 'pos1', got: {}",
            msg
        );
        let leaks_raw_column = msg.contains("__ggsql_aes_");
        assert!(
            !leaks_raw_column,
            "message must not mention raw column name, got: {}",
            msg
        );
    }

    /// Same check under `PROJECT TO polar`: the message must start with
    /// `Scale 'angle':` and must not mention `pos1`.
    #[cfg(feature = "duckdb")]
    #[test]
    fn explicit_scale_type_dtype_error_uses_angle_name_under_polar() {
        let msg = validation_error_message(
            r#"
SELECT 'a' AS angle, 1 AS radius
VISUALISE angle, radius
DRAW point
PROJECT TO polar
SCALE CONTINUOUS angle
"#,
        );
        let has_user_facing_prefix = msg.starts_with("Scale 'angle':");
        assert!(
            has_user_facing_prefix,
            "expected message to start with \"Scale 'angle':\", got: {}",
            msg
        );
        let leaks_internal_name = msg.contains("pos1");
        assert!(
            !leaks_internal_name,
            "message must not mention internal name 'pos1', got: {}",
            msg
        );
    }
}
}