1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
use crate::TEMP_SUFFIX;
use crate::chart::Chart;
use crate::core::data::{ColumnVector, Dataset};
use crate::error::ChartonError;
use crate::mark::Mark;
use ahash::AHashMap;
impl<T: Mark> Chart<T> {
    /// Bins the X field and replaces the chart's dataset with per-bin counts.
    ///
    /// The X column's `[min, max]` range is split into `bins` equal-width bins
    /// (10 when the X encoding does not specify a bin count). Rows are counted per
    /// `(bin, color)` pair, optionally normalized, and then expanded into the full
    /// cartesian product of bins and colors so that empty combinations appear with
    /// a count of `0.0`.
    ///
    /// The resulting dataset has:
    /// * the X field, holding bin midpoints;
    /// * the Y field, holding counts (or fractions when `normalize` is set);
    /// * the color field, only when a color encoding is present.
    ///
    /// Rows for which `get_f64` yields no value are treated as `min_val` and thus
    /// land in the first bin.
    ///
    /// # Errors
    ///
    /// Returns [`ChartonError::Encoding`] when the X or Y encoding is missing or
    /// when the requested bin count is zero. Errors from column lookup and from
    /// `Dataset::add_column` are propagated unchanged.
    pub(crate) fn transform_histogram_data(mut self) -> Result<Self, ChartonError> {
        // --- STEP 1: Extract Encodings ---
        let x_enc = self
            .encoding
            .x
            .as_ref()
            .ok_or_else(|| ChartonError::Encoding("X missing".into()))?;
        let y_enc = self
            .encoding
            .y
            .as_ref()
            .ok_or_else(|| ChartonError::Encoding("Y missing".into()))?;
        let color_enc = self.encoding.color.as_ref();
        let bin_field = &x_enc.field;
        let count_field = &y_enc.field;

        // --- STEP 2: Calculate Binning Parameters ---
        let x_col = self.data.column(bin_field)?;
        let (min_val, max_val) = x_col.min_max();
        let n_bins = x_enc.bins.unwrap_or(10);
        if n_bins == 0 {
            // `n_bins - 1` below would underflow; reject the request explicitly.
            return Err(ChartonError::Encoding(
                "Histogram requires at least one bin".into(),
            ));
        }
        let last_bin = n_bins - 1;

        // Fall back to a unit width whenever the computed width is unusable
        // (single bin, constant column, or non-finite range). A zero width would
        // collapse every midpoint onto `min_val` and turn the bin index into a
        // NaN-derived value.
        let raw_width = (max_val - min_val) / (n_bins as f64);
        let bin_width = if n_bins > 1 && raw_width.is_finite() && raw_width > 0.0 {
            raw_width
        } else {
            1.0
        };

        // Pre-calculate bin midpoints (natural numeric order for the X axis).
        let bin_middles: Vec<f64> = (0..n_bins)
            .map(|i| min_val + (i as f64 + 0.5) * bin_width)
            .collect();

        // --- STEP 3: Establish Deterministic Order for Color ---
        // NOTE(review): this relies on `unique_values()` yielding values in
        // first-appearance order — confirm against its implementation.
        let default_label = format!("{}_default", TEMP_SUFFIX);
        let color_list: Vec<String> = match color_enc {
            Some(c_enc) => self.data.column(&c_enc.field)?.unique_values(),
            None => vec![default_label.clone()],
        };

        // --- STEP 4: Aggregate Counts (the "group by" phase) ---
        // Key: color label, Value: one counter per bin. Keeping the bins in a
        // vector avoids building an owned `(usize, String)` key for every probe.
        let mut counts_by_color: AHashMap<String, Vec<f64>> = AHashMap::new();
        for row in 0..self.data.height() {
            let val = x_col.get_f64(row).unwrap_or(min_val);
            // Float-to-usize casts saturate (NaN and negatives become 0), and the
            // clamp folds `max_val` itself into the last bin.
            let bin_idx = (((val - min_val) / bin_width).floor() as usize).min(last_bin);
            let color_label = match color_enc {
                Some(c_enc) => self.data.get_str_or(&c_enc.field, row, "null"),
                None => default_label.clone(),
            };
            let bins = counts_by_color
                .entry(color_label)
                .or_insert_with(|| vec![0.0_f64; n_bins]);
            bins[bin_idx] += 1.0;
        }

        // --- STEP 5: Apply Normalization (Optional) ---
        // Each color group is scaled so its counts sum to 1.0. Without a color
        // encoding there is exactly one group, so this is the global normalization.
        if y_enc.normalize {
            for bins in counts_by_color.values_mut() {
                let total: f64 = bins.iter().sum();
                if total > 0.0 {
                    for count in bins.iter_mut() {
                        *count /= total;
                    }
                }
            }
        }

        // --- STEP 6: Cartesian Product & Gap Filling (using the established order) ---
        // Bins form the outer loop and colors the inner loop; combinations that
        // never occurred in the data are emitted with a count of 0.0.
        let mut final_x: Vec<f64> = Vec::new();
        let mut final_y: Vec<f64> = Vec::new();
        let mut final_color: Vec<String> = Vec::new();
        for (bin_idx, &mid) in bin_middles.iter().enumerate() {
            for color in &color_list {
                let count = counts_by_color
                    .get(color)
                    .and_then(|bins| bins.get(bin_idx))
                    .copied()
                    .unwrap_or(0.0);
                final_x.push(mid);
                final_y.push(count);
                if color_enc.is_some() {
                    final_color.push(color.clone());
                }
            }
        }

        // --- STEP 7: Rebuild Dataset ---
        let mut new_ds = Dataset::new();
        new_ds.add_column(bin_field, ColumnVector::F64 { data: final_x })?;
        new_ds.add_column(count_field, ColumnVector::F64 { data: final_y })?;
        if let Some(c_enc) = color_enc {
            new_ds.add_column(
                &c_enc.field,
                ColumnVector::String {
                    data: final_color,
                    // The cartesian product fills every slot with a valid String.
                    validity: None,
                },
            )?;
        }
        self.data = new_ds;
        Ok(self)
    }
}