1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
use crate::chart::Chart;
use crate::error::ChartonError;
use crate::mark::Mark;
use crate::prelude::IntoChartonSource;
use polars::prelude::*;
impl<T: Mark> Chart<T> {
// Handle grouping and aggregation of data for histogram chart
pub(crate) fn transform_histogram_data(mut self) -> Result<Self, ChartonError> {
// Check if we have the required encodings
let x_encoding = self.encoding.x.as_ref().unwrap();
let y_encoding = self.encoding.y.as_ref().unwrap();
// Now perform the data transformation
let (bin_field, count_field) = (x_encoding.field.clone(), y_encoding.field.clone());
// Handle continuous data by binning
let processed_df = {
let mut df = self.data.df.clone();
// Get the x series data (already converted to f64)
let x_series = df.column(&bin_field)?.f64()?.clone().into_series();
// Calculate number of bins. Now we can safely unwrap because apply_default_encodings
// has already resolved this value.
let n_bins = x_encoding.bins.unwrap();
// Get min and max values for binning using Polars' built-in methods
let min_val = x_series
.f64()?
.min()
.expect("Internal error: Failed to calculate minimum value for histogram data");
let max_val = x_series
.f64()?
.max()
.expect("Internal error: Failed to calculate maximum value for histogram data");
// Create bins. bin_width is used to calculate the data range of each bin
let bin_width = if n_bins > 1 {
(max_val - min_val) / (n_bins as f64)
} else {
1.0 // arbitrary non-zero value when n_bins = 1
};
let mut bins = Vec::with_capacity(n_bins + 1);
for i in 0..=n_bins {
bins.push(min_val + (i as f64) * bin_width);
}
// Store bin labels for later use
let labels: Vec<String> = (0..n_bins).map(|i| format!("bin_{}", i)).collect();
// Calculate middle values of bins
let middles: Vec<f64> = bins.windows(2).map(|w| (w[0] + w[1]) / 2.0).collect();
// Create binned column
let binned_series = crate::stats::stat_binning::cut(&x_series, &bins, &labels);
let renamed_series = binned_series.with_name((&bin_field).into());
df.with_column(renamed_series)?;
// Group by bins and count occurrences, using count_field as the column name
// Handle color encoding similar to bar charts
let grouped_df = if let Some(color_encoding) = &self.encoding.color {
// If we have color encoding, group by both bin field and color field
df.lazy()
.group_by_stable([col(&bin_field), col(&color_encoding.field)])
.agg([col(&bin_field).count().alias(&count_field)])
.collect()?
} else {
// If no color encoding, group by bin field only
df.lazy()
.group_by_stable([col(&bin_field)])
.agg([col(&bin_field).count().alias(&count_field)])
.collect()?
};
// Apply normalization if requested
let grouped_df = if y_encoding.normalize {
if let Some(color_encoding) = &self.encoding.color {
// Normalize within each color group (each group sums to 1)
grouped_df
.lazy()
.with_column(
(col(&count_field).cast(DataType::Float64)
/ col(&count_field).sum().over([col(&color_encoding.field)]))
.alias(&count_field),
)
.collect()?
} else {
// Normalize all values to sum to 1
grouped_df
.lazy()
.with_column(
(col(&count_field).cast(DataType::Float64) / col(&count_field).sum())
.alias(&count_field),
)
.collect()?
}
} else {
grouped_df
.lazy()
.with_column(col(&count_field).cast(DataType::Float64))
.collect()?
};
// Create all possible bin labels to ensure empty bins are included
let all_bin_labels: Vec<String> = (0..n_bins).map(|i| format!("bin_{}", i)).collect();
// Handle color encoding when filling missing combinations
let filled_df = if let Some(color_encoding) = &self.encoding.color {
// Get unique color values
let color_unique_series =
grouped_df.column(&color_encoding.field)?.unique_stable()?;
let color_values: Vec<String> = color_unique_series
.str()?
.into_no_null_iter()
.map(|s| s.to_string())
.collect();
// Create all combinations of bin labels and color values
let bin_repeated: Vec<String> = all_bin_labels
.iter()
.flat_map(|bin| vec![bin.clone(); color_values.len()])
.collect();
let color_repeated: Vec<String> = color_values
.iter()
.cycle()
.take(all_bin_labels.len() * color_values.len())
.cloned()
.collect();
// Create DataFrame with all combinations
let all_combinations_df = df![
&bin_field => bin_repeated,
&color_encoding.field => color_repeated
]?;
// Join with the grouped data to fill in missing combinations
all_combinations_df
.lazy()
.join(
grouped_df.lazy(),
[col(&bin_field), col(&color_encoding.field)],
[col(&bin_field), col(&color_encoding.field)],
JoinType::Left.into(),
)
.collect()?
.lazy()
.with_column(col(&count_field).fill_null(lit(0)))
.collect()?
} else {
// Create DataFrame with all bins for no color encoding case
let all_bins_df = df![
&bin_field => all_bin_labels
]?;
// Join with the grouped data to include zero counts for empty bins
all_bins_df
.lazy()
.join(
grouped_df.lazy(),
[col(&bin_field)],
[col(&bin_field)],
JoinType::Left.into(),
)
.collect()?
.lazy()
.with_column(col(&count_field).fill_null(lit(0)))
.collect()?
};
// Replace bin labels with middle values
let mut label_to_middle = std::collections::HashMap::new();
for (label, &middle) in (0..n_bins)
.map(|i| format!("bin_{}", i))
.zip(middles.iter())
{
label_to_middle.insert(label, middle);
}
// Map the bin column values to middle values
let bin_series = filled_df
.column(&bin_field)?
.str()
.expect("Bin field should be string type");
let new_bin_values: Vec<Option<f64>> = bin_series
.into_iter()
.map(|opt_val| opt_val.and_then(|val| label_to_middle.get(val).copied()))
.collect();
let new_bin_series = Series::new((&bin_field).into(), new_bin_values);
let mut result_df = filled_df;
// Replace the column (e.g. bin_field) while maintaining column order
// with_column will replace the existing column with the same name
result_df.with_column(new_bin_series)?;
result_df
};
self.data = (&processed_df).into_source()?;
Ok(self)
}
}