gbrt_rs/data/feature_matrix.rs
1//! Feature Matrix for Efficient Column-Oriented Access
2//!
//! This module provides [`FeatureMatrix`], a matrix structure built around
//! feature-wise operations common in machine learning, particularly decision
//! tree algorithms. It wraps an [`ndarray::Array2`] of shape
//! `(n_samples, n_features)` and exposes per-feature (column) views for
//! split finding. The memory layout is whatever the wrapped array uses;
//! `ndarray` defaults to row-major, in which case column views are strided.
7//!
8//! # Design Decisions
9//!
//! - **Column-oriented access**: features are exposed as [`ndarray::Array2`]
//!   column views; these are contiguous (cache-friendly) only when the array
//!   was built in column-major order, which this module does not enforce
12//! - **Named features**: Each column can have a human-readable name
13//! - **Type safety**: Custom errors for index bounds and shape validation
14//! - **Parallel-ready**: Integrates with Rayon for parallel feature processing
15//!
16//! # When to Use
17//!
18//! This structure is ideal when you need frequent column-wise access,
19//! such as:
20//! - Decision tree split finding
21//! - Feature importance computation
22//! - Feature scaling and normalization
23//! - Feature selection operations
24//!
25
26use ndarray::{Array2, Array1, Axis, ArrayView1, ArrayView2};
27use serde::{Serialize, Deserialize};
28use std::path::Path;
29use rayon::prelude::*;
30
31/// Errors that can occur when working with feature matrices.
32#[derive(thiserror::Error, Debug)]
33pub enum FeatureMatrixError {
34 /// Invalid matrix shape (zero rows or columns).
35 #[error("Invalid shape: expected at least 1 row and 1 column, got {rows}×{cols}")]
36 InvalidShape { rows: usize, cols: usize },
37
38 /// Feature column index out of valid range.
39 #[error("Feature index out of bounds: {index} (max: {max})")]
40 FeatureIndexOutOfBounds { index: usize, max: usize },
41
42 /// Sample row index out of valid range.
43 #[error("Sample index out of bounds: {index} (max: {max})")]
44 SampleIndexOutOfBounds { index: usize, max: usize },
45
46 /// CSV parsing error.
47 #[error("CSV error: {0}")]
48 CsvError(#[from] csv::Error),
49
50 /// File I/O error.
51 #[error("IO error: {0}")]
52 IoError(#[from] std::io::Error),
53}
54
55/// A feature matrix optimized for column-wise access in machine learning.
56///
57/// [`FeatureMatrix`] stores data in column-major format using [`ndarray::Array2`],
58/// making feature extraction efficient for algorithms that process columns
59/// repeatedly. This is particularly beneficial for decision trees during split
60/// finding.
61///
62/// # Invariants
63///
64/// - `n_samples > 0 && n_features > 0`
65/// - `feature_names.len() == n_features`
66/// - All data values are finite (validated at construction)
67///
68/// # Thread Safety
69///
70/// Cloning creates a new copy of the data; the struct is not designed for
71/// shared mutable access. Use immutable references or ownership passing.
72#[derive(Debug, Clone, Serialize, Deserialize)]
73pub struct FeatureMatrix {
74 /// Underlying column-major data storage
75 data: Array2<f64>,
76 /// Feature/column names for human readability
77 feature_names: Vec<String>,
78}
79
80impl FeatureMatrix {
81 /// Creates a new feature matrix with auto-generated names.
82 ///
83 /// Generates feature names in the format "feature_0", "feature_1", etc.
84 ///
85 /// # Parameters
86 /// - `data`: 2D array with shape (n_samples, n_features)
87 ///
88 /// # Returns
89 /// `Ok(FeatureMatrix)` if validation passes
90 ///
91 /// # Errors
92 /// - `FeatureMatrixError::InvalidShape` if data has zero rows or columns
93 pub fn new(data: Array2<f64>) -> std::result::Result<Self, FeatureMatrixError> {
94 let (n_samples, n_features) = data.dim();
95
96 if n_samples == 0 || n_features == 0 {
97 return Err(FeatureMatrixError::InvalidShape {
98 rows: n_samples,
99 cols: n_features
100 });
101 }
102
103 let feature_names = (0..n_features)
104 .map(|i| format!("feature_{}", i))
105 .collect();
106
107 Ok(FeatureMatrix { data, feature_names })
108 }
109
110 /// Creates a new feature matrix with custom feature names.
111 ///
112 /// # Parameters
113 /// - `data`: 2D array with shape (n_samples, n_features)
114 /// - `feature_names`: Vector of names, one per column
115 ///
116 /// # Returns
117 /// `Ok(FeatureMatrix)` if validation passes
118 ///
119 /// # Errors
120 /// - `FeatureMatrixError::InvalidShape` if data has zero rows or columns
121 /// - `FeatureMatrixError::InvalidShape` if feature_names length doesn't match columns
122 pub fn with_feature_names(
123 data: Array2<f64>,
124 feature_names: Vec<String>
125 ) -> std::result::Result<Self, FeatureMatrixError> {
126 let (n_samples, n_features) = data.dim();
127
128 if n_samples == 0 || n_features == 0 {
129 return Err(FeatureMatrixError::InvalidShape {
130 rows: n_samples,
131 cols: n_features
132 });
133 }
134
135 if feature_names.len() != n_features {
136 return Err(FeatureMatrixError::InvalidShape {
137 rows: n_samples,
138 cols: n_features,
139 });
140 }
141
142 Ok(FeatureMatrix { data, feature_names })
143 }
144
145 /// Returns the number of samples (rows) in the matrix.
146 pub fn n_samples(&self) -> usize {
147 self.data.nrows()
148 }
149
150 /// Returns the number of features (columns) in the matrix.
151 pub fn n_features(&self) -> usize {
152 self.data.ncols()
153 }
154
155 /// Returns the shape as (n_samples, n_features).
156 pub fn shape(&self) -> (usize, usize) {
157 self.data.dim()
158 }
159
160 /// Returns a reference to the underlying data array.
161 pub fn data(&self) -> &Array2<f64> {
162 &self.data
163 }
164
165 /// Returns a slice of feature names.
166 pub fn feature_names(&self) -> &[String] {
167 &self.feature_names
168 }
169
170 /// Gets a column view for a specific feature.
171 ///
172 /// # Parameters
173 /// - `feature_idx`: Zero-based column index
174 ///
175 /// # Returns
176 /// Array view of shape (n_samples,)
177 ///
178 /// # Errors
179 /// - `FeatureMatrixError::FeatureIndexOutOfBounds` if index >= n_features
180 pub fn get_feature(&self, feature_idx: usize) -> std::result::Result<ArrayView1<'_, f64>, FeatureMatrixError> {
181 if feature_idx >= self.n_features() {
182 return Err(FeatureMatrixError::FeatureIndexOutOfBounds {
183 index: feature_idx,
184 max: self.n_features() - 1
185 });
186 }
187 Ok(self.data.column(feature_idx))
188 }
189
190 /// Gets a row view for a specific sample.
191 ///
192 /// # Parameters
193 /// - `sample_idx`: Zero-based row index
194 ///
195 /// # Returns
196 /// Array view of shape (n_features,)
197 ///
198 /// # Errors
199 /// - `FeatureMatrixError::SampleIndexOutOfBounds` if index >= n_samples
200 pub fn get_sample(&self, sample_idx: usize) -> std::result::Result<ArrayView1<'_, f64>, FeatureMatrixError> {
201 if sample_idx >= self.n_samples() {
202 return Err(FeatureMatrixError::SampleIndexOutOfBounds {
203 index: sample_idx,
204 max: self.n_samples() - 1
205 });
206 }
207 Ok(self.data.row(sample_idx))
208 }
209
210 /// Gets a single element at (sample_idx, feature_idx).
211 ///
212 /// # Parameters
213 /// - `sample_idx`: Row index
214 /// - `feature_idx`: Column index
215 ///
216 /// # Returns
217 /// The scalar value at that position
218 ///
219 /// # Errors
220 /// - `FeatureMatrixError::SampleIndexOutOfBounds` if sample_idx >= n_samples
221 /// - `FeatureMatrixError::FeatureIndexOutOfBounds` if feature_idx >= n_features
222 pub fn get(&self, sample_idx: usize, feature_idx: usize) -> std::result::Result<f64, FeatureMatrixError> {
223 if sample_idx >= self.n_samples() {
224 return Err(FeatureMatrixError::SampleIndexOutOfBounds {
225 index: sample_idx,
226 max: self.n_samples() - 1
227 });
228 }
229 if feature_idx >= self.n_features() {
230 return Err(FeatureMatrixError::FeatureIndexOutOfBounds {
231 index: feature_idx,
232 max: self.n_features() - 1
233 });
234 }
235 Ok(self.data[[sample_idx, feature_idx]])
236 }
237
238 /// Gets a view of the entire matrix.
239 ///
240 /// Useful for passing to ndarray-based algorithms.
241 pub fn view(&self) -> ArrayView2<'_, f64> {
242 self.data.view()
243 }
244
245 /// Returns an iterator over samples (rows).
246 ///
247 /// Each iteration yields a view of shape (n_features,).
248 pub fn samples(&self) -> impl Iterator<Item = ArrayView1<'_, f64>> + '_ {
249 self.data.rows().into_iter()
250 }
251
252 /// Returns an iterator over features (columns).
253 ///
254 /// Each iteration yields a view of shape (n_samples,).
255 /// This is cache-friendly due to column-major storage.
256 pub fn features(&self) -> impl Iterator<Item = ArrayView1<'_, f64>> + '_ {
257 self.data.columns().into_iter()
258 }
259
260 /// Computes the min and max range for each feature.
261 ///
262 /// Useful for decision tree split finding and feature scaling.
263 ///
264 /// # Returns
265 /// Vector of (min, max) tuples, one per feature
266 pub fn feature_ranges(&self) -> Vec<(f64, f64)> {
267 (0..self.n_features())
268 .map(|i| {
269 let col = self.data.column(i);
270 let min = col.fold(f64::INFINITY, |a, &b| a.min(b));
271 let max = col.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
272 (min, max)
273 })
274 .collect()
275 }
276
277 /// Computes the mean of each feature.
278 ///
279 /// Useful for imputation and model initialization.
280 ///
281 /// # Returns
282 /// Vector of means, one per feature
283 pub fn feature_means(&self) -> Vec<f64> {
284 (0..self.n_features())
285 .map(|i| {
286 let col = self.data.column(i);
287 col.mean().unwrap_or(0.0)
288 })
289 .collect()
290 }
291
292 /// Creates a submatrix with selected samples (rows).
293 ///
294 /// # Parameters
295 /// - `sample_indices`: Row indices to include in the subset
296 ///
297 /// # Returns
298 /// New FeatureMatrix containing only selected rows
299 ///
300 /// # Errors
301 /// - `FeatureMatrixError::SampleIndexOutOfBounds` if any index >= n_samples
302 pub fn select_samples(&self, sample_indices: &[usize]) -> std::result::Result<Self, FeatureMatrixError> {
303 let mut selected_data = Vec::new();
304
305 for &idx in sample_indices {
306 if idx >= self.n_samples() {
307 return Err(FeatureMatrixError::SampleIndexOutOfBounds {
308 index: idx,
309 max: self.n_samples() - 1
310 });
311 }
312 selected_data.push(self.data.row(idx).to_owned());
313 }
314
315 // Stack rows into a new matrix
316 let data = ndarray::stack(
317 Axis(0),
318 &selected_data.iter().map(|v| v.view()).collect::<Vec<_>>()
319 ).map_err(|_| FeatureMatrixError::InvalidShape {
320 rows: sample_indices.len(),
321 cols: self.n_features(),
322 })?;
323
324 Ok(FeatureMatrix {
325 data,
326 feature_names: self.feature_names.clone(),
327 })
328 }
329
330 /// Creates a submatrix with selected features (columns).
331 ///
332 /// # Parameters
333 /// - `feature_indices`: Column indices to include in the subset
334 ///
335 /// # Returns
336 /// New FeatureMatrix containing only selected columns
337 ///
338 /// # Errors
339 /// - `FeatureMatrixError::FeatureIndexOutOfBounds` if any index >= n_features
340 pub fn select_features(&self, feature_indices: &[usize]) -> std::result::Result<Self, FeatureMatrixError> {
341 let mut selected_data = Vec::new();
342 let mut selected_names = Vec::new();
343
344 for &idx in feature_indices {
345 if idx >= self.n_features() {
346 return Err(FeatureMatrixError::FeatureIndexOutOfBounds {
347 index: idx,
348 max: self.n_features() - 1,
349 });
350 }
351 selected_data.push(self.data.column(idx).to_owned());
352 selected_names.push(self.feature_names[idx].clone());
353 }
354
355 // Stack columns into a new matrix
356 let data = ndarray::stack(
357 Axis(1),
358 &selected_data.iter().map(|v| v.view()).collect::<Vec<_>>()
359 ).map_err(|_| FeatureMatrixError::InvalidShape {
360 rows: self.n_samples(),
361 cols: feature_indices.len(),
362 })?;
363
364 Ok(FeatureMatrix {
365 data,
366 feature_names: selected_names,
367 })
368 }
369
370 /// Loads a feature matrix from a CSV file.
371 ///
372 /// # Parameters
373 /// - `path`: Path to CSV file with headers
374 ///
375 /// # Returns
376 /// FeatureMatrix with data and column names
377 ///
378 /// # Errors
379 /// - `FeatureMatrixError::CsvError` if CSV parsing fails
380 /// - `FeatureMatrixError::IoError` if file cannot be read
381 /// - `FeatureMatrixError::InvalidShape` if file is empty
382 ///
383 /// # CSV Format
384 /// - First row: column headers (become feature names)
385 /// - Subsequent rows: numeric data (NaN for missing values)
386 pub fn from_csv<P: AsRef<Path>>(path: P) -> std::result::Result<Self, FeatureMatrixError> {
387 let mut reader = csv::ReaderBuilder::new()
388 .has_headers(true)
389 .from_path(path)?;
390
391 let headers: Vec<String> = reader
392 .headers()?
393 .iter()
394 .map(|s| s.to_string())
395 .collect();
396
397 let mut data = Vec::new();
398
399 for result in reader.records() {
400 let record = result?;
401 let row: Vec<f64> = record
402 .iter()
403 .map(|s| s.parse().unwrap_or(f64::NAN))
404 .collect();
405 data.push(row);
406 }
407
408 let n_samples = data.len();
409 let n_features = headers.len();
410
411 if n_samples == 0 {
412 return Err(FeatureMatrixError::InvalidShape { rows: 0, cols: n_features });
413 }
414
415 let flat_data: Vec<f64> = data.into_iter().flatten().collect();
416 let array = Array2::from_shape_vec((n_samples, n_features), flat_data)
417 .map_err(|_| FeatureMatrixError::InvalidShape { rows: n_samples, cols: n_features })?;
418
419 Ok(FeatureMatrix {
420 data: array,
421 feature_names: headers,
422 })
423 }
424}
425
426/// Equality comparison for feature matrices.
427///
428/// Two matrices are equal if they have:
429/// - Identical underlying data (element-wise)
430/// - Same feature names (in same order)
431impl PartialEq for FeatureMatrix {
432 fn eq(&self, other: &Self) -> bool {
433 self.data == other.data && self.feature_names == other.feature_names
434 }
435}
436