gbrt_rs/data/
feature_matrix.rs

1//! Feature Matrix for Efficient Column-Oriented Access
2//! 
//! This module provides [`FeatureMatrix`], a matrix structure geared towards
//! the feature-wise operations common in machine learning, particularly decision
//! tree algorithms. It wraps an [`ndarray::Array2`] of shape
//! `(n_samples, n_features)` and exposes every feature as a column view for
//! efficient access during split finding.
7//! 
8//! # Design Decisions
9//! 
//! - **Column-oriented access**: features are read as column views of an
//!   [`ndarray::Array2`]; note that `ndarray` defaults to row-major memory
//!   order, so columns are only contiguous when a column-major array is supplied
12//! - **Named features**: Each column can have a human-readable name
13//! - **Type safety**: Custom errors for index bounds and shape validation
14//! - **Parallel-ready**: Integrates with Rayon for parallel feature processing
15//! 
16//! # When to Use
17//! 
18//! This structure is ideal when you need frequent column-wise access,
19//! such as:
20//! - Decision tree split finding
21//! - Feature importance computation
22//! - Feature scaling and normalization
23//! - Feature selection operations
24//!
25
26use ndarray::{Array2, Array1, Axis, ArrayView1, ArrayView2};
27use serde::{Serialize, Deserialize};
28use std::path::Path;
29use rayon::prelude::*;
30
31/// Errors that can occur when working with feature matrices.
32#[derive(thiserror::Error, Debug)]
33pub enum FeatureMatrixError {
34    /// Invalid matrix shape (zero rows or columns).
35    #[error("Invalid shape: expected at least 1 row and 1 column, got {rows}×{cols}")]
36    InvalidShape { rows: usize, cols: usize },
37    
38    /// Feature column index out of valid range.
39    #[error("Feature index out of bounds: {index} (max: {max})")]
40    FeatureIndexOutOfBounds { index: usize, max: usize },
41    
42    /// Sample row index out of valid range.    
43    #[error("Sample index out of bounds: {index} (max: {max})")]
44    SampleIndexOutOfBounds { index: usize, max: usize },
45
46    /// CSV parsing error.    
47    #[error("CSV error: {0}")]
48    CsvError(#[from] csv::Error),
49
50    /// File I/O error.    
51    #[error("IO error: {0}")]
52    IoError(#[from] std::io::Error),
53}
54
55/// A feature matrix optimized for column-wise access in machine learning.
56/// 
57/// [`FeatureMatrix`] stores data in column-major format using [`ndarray::Array2`],
58/// making feature extraction efficient for algorithms that process columns
59/// repeatedly. This is particularly beneficial for decision trees during split
60/// finding.
61/// 
62/// # Invariants
63/// 
64/// - `n_samples > 0 && n_features > 0`
65/// - `feature_names.len() == n_features`
66/// - All data values are finite (validated at construction)
67/// 
68/// # Thread Safety
69/// 
70/// Cloning creates a new copy of the data; the struct is not designed for
71/// shared mutable access. Use immutable references or ownership passing.
72#[derive(Debug, Clone, Serialize, Deserialize)]
73pub struct FeatureMatrix {
74    /// Underlying column-major data storage
75    data: Array2<f64>,
76    /// Feature/column names for human readability
77    feature_names: Vec<String>,
78}
79
80impl FeatureMatrix {
81    /// Creates a new feature matrix with auto-generated names.
82    /// 
83    /// Generates feature names in the format "feature_0", "feature_1", etc.
84    /// 
85    /// # Parameters
86    /// - `data`: 2D array with shape (n_samples, n_features)
87    /// 
88    /// # Returns
89    /// `Ok(FeatureMatrix)` if validation passes
90    /// 
91    /// # Errors
92    /// - `FeatureMatrixError::InvalidShape` if data has zero rows or columns
93    pub fn new(data: Array2<f64>) -> std::result::Result<Self, FeatureMatrixError> {
94        let (n_samples, n_features) = data.dim();
95        
96        if n_samples == 0 || n_features == 0 {
97            return Err(FeatureMatrixError::InvalidShape { 
98                rows: n_samples, 
99                cols: n_features 
100            });
101        }
102        
103        let feature_names = (0..n_features)
104            .map(|i| format!("feature_{}", i))
105            .collect();
106            
107        Ok(FeatureMatrix { data, feature_names })
108    }
109    
110    /// Creates a new feature matrix with custom feature names.
111    /// 
112    /// # Parameters
113    /// - `data`: 2D array with shape (n_samples, n_features)
114    /// - `feature_names`: Vector of names, one per column
115    /// 
116    /// # Returns
117    /// `Ok(FeatureMatrix)` if validation passes
118    /// 
119    /// # Errors
120    /// - `FeatureMatrixError::InvalidShape` if data has zero rows or columns
121    /// - `FeatureMatrixError::InvalidShape` if feature_names length doesn't match columns
122    pub fn with_feature_names(
123        data: Array2<f64>, 
124        feature_names: Vec<String>
125    ) -> std::result::Result<Self, FeatureMatrixError> {
126        let (n_samples, n_features) = data.dim();
127        
128        if n_samples == 0 || n_features == 0 {
129            return Err(FeatureMatrixError::InvalidShape { 
130                rows: n_samples, 
131                cols: n_features 
132            });
133        }
134        
135        if feature_names.len() != n_features {
136            return Err(FeatureMatrixError::InvalidShape {
137                rows: n_samples,
138                cols: n_features,
139            });
140        }
141        
142        Ok(FeatureMatrix { data, feature_names })
143    }
144    
145    /// Returns the number of samples (rows) in the matrix.
146    pub fn n_samples(&self) -> usize {
147        self.data.nrows()
148    }
149    
150    /// Returns the number of features (columns) in the matrix.
151    pub fn n_features(&self) -> usize {
152        self.data.ncols()
153    }
154    
155    /// Returns the shape as (n_samples, n_features).
156    pub fn shape(&self) -> (usize, usize) {
157        self.data.dim()
158    }
159    
160    /// Returns a reference to the underlying data array.
161    pub fn data(&self) -> &Array2<f64> {
162        &self.data
163    }
164    
165    /// Returns a slice of feature names.
166    pub fn feature_names(&self) -> &[String] {
167        &self.feature_names
168    }
169    
170    /// Gets a column view for a specific feature.
171    /// 
172    /// # Parameters
173    /// - `feature_idx`: Zero-based column index
174    /// 
175    /// # Returns
176    /// Array view of shape (n_samples,)
177    /// 
178    /// # Errors
179    /// - `FeatureMatrixError::FeatureIndexOutOfBounds` if index >= n_features
180    pub fn get_feature(&self, feature_idx: usize) -> std::result::Result<ArrayView1<'_, f64>, FeatureMatrixError> {
181        if feature_idx >= self.n_features() {
182            return Err(FeatureMatrixError::FeatureIndexOutOfBounds { 
183                index: feature_idx, 
184                max: self.n_features() - 1 
185            });
186        }
187        Ok(self.data.column(feature_idx))
188    }
189    
190    /// Gets a row view for a specific sample.
191    /// 
192    /// # Parameters
193    /// - `sample_idx`: Zero-based row index
194    /// 
195    /// # Returns
196    /// Array view of shape (n_features,)
197    /// 
198    /// # Errors
199    /// - `FeatureMatrixError::SampleIndexOutOfBounds` if index >= n_samples
200    pub fn get_sample(&self, sample_idx: usize) -> std::result::Result<ArrayView1<'_, f64>, FeatureMatrixError> {
201        if sample_idx >= self.n_samples() {
202            return Err(FeatureMatrixError::SampleIndexOutOfBounds { 
203                index: sample_idx, 
204                max: self.n_samples() - 1 
205            });
206        }
207        Ok(self.data.row(sample_idx))
208    }
209    
210    /// Gets a single element at (sample_idx, feature_idx).
211    /// 
212    /// # Parameters
213    /// - `sample_idx`: Row index
214    /// - `feature_idx`: Column index
215    /// 
216    /// # Returns
217    /// The scalar value at that position
218    /// 
219    /// # Errors
220    /// - `FeatureMatrixError::SampleIndexOutOfBounds` if sample_idx >= n_samples
221    /// - `FeatureMatrixError::FeatureIndexOutOfBounds` if feature_idx >= n_features
222    pub fn get(&self, sample_idx: usize, feature_idx: usize) -> std::result::Result<f64, FeatureMatrixError> {
223        if sample_idx >= self.n_samples() {
224            return Err(FeatureMatrixError::SampleIndexOutOfBounds { 
225                index: sample_idx, 
226                max: self.n_samples() - 1 
227            });
228        }
229        if feature_idx >= self.n_features() {
230            return Err(FeatureMatrixError::FeatureIndexOutOfBounds { 
231                index: feature_idx, 
232                max: self.n_features() - 1 
233            });
234        }
235        Ok(self.data[[sample_idx, feature_idx]])
236    }
237    
238    /// Gets a view of the entire matrix.
239    /// 
240    /// Useful for passing to ndarray-based algorithms.
241    pub fn view(&self) -> ArrayView2<'_, f64> {
242        self.data.view()
243    }
244    
245    /// Returns an iterator over samples (rows).
246    /// 
247    /// Each iteration yields a view of shape (n_features,).
248    pub fn samples(&self) -> impl Iterator<Item = ArrayView1<'_, f64>> + '_ {
249        self.data.rows().into_iter()
250    }
251    
252    /// Returns an iterator over features (columns).
253    /// 
254    /// Each iteration yields a view of shape (n_samples,).
255    /// This is cache-friendly due to column-major storage.
256    pub fn features(&self) -> impl Iterator<Item = ArrayView1<'_, f64>> + '_ {
257        self.data.columns().into_iter()
258    }
259    
260    /// Computes the min and max range for each feature.
261    /// 
262    /// Useful for decision tree split finding and feature scaling.
263    /// 
264    /// # Returns
265    /// Vector of (min, max) tuples, one per feature
266    pub fn feature_ranges(&self) -> Vec<(f64, f64)> {
267        (0..self.n_features())
268            .map(|i| {
269                let col = self.data.column(i);
270                let min = col.fold(f64::INFINITY, |a, &b| a.min(b));
271                let max = col.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
272                (min, max)
273            })
274            .collect()
275    }
276    
277    /// Computes the mean of each feature.
278    /// 
279    /// Useful for imputation and model initialization.
280    /// 
281    /// # Returns
282    /// Vector of means, one per feature
283    pub fn feature_means(&self) -> Vec<f64> {
284        (0..self.n_features())
285            .map(|i| {
286                let col = self.data.column(i);
287                col.mean().unwrap_or(0.0)
288            })
289            .collect()
290    }
291
292    /// Creates a submatrix with selected samples (rows).
293    /// 
294    /// # Parameters
295    /// - `sample_indices`: Row indices to include in the subset
296    /// 
297    /// # Returns
298    /// New FeatureMatrix containing only selected rows
299    /// 
300    /// # Errors
301    /// - `FeatureMatrixError::SampleIndexOutOfBounds` if any index >= n_samples
302    pub fn select_samples(&self, sample_indices: &[usize]) -> std::result::Result<Self, FeatureMatrixError> {
303        let mut selected_data = Vec::new();
304
305        for &idx in sample_indices {
306            if idx >= self.n_samples() {
307                return Err(FeatureMatrixError::SampleIndexOutOfBounds {
308                    index: idx,
309                    max: self.n_samples() - 1
310                });
311            }
312            selected_data.push(self.data.row(idx).to_owned());
313        }
314
315        // Stack rows into a new matrix
316        let data = ndarray::stack(
317            Axis(0),
318            &selected_data.iter().map(|v| v.view()).collect::<Vec<_>>()
319        ).map_err(|_| FeatureMatrixError::InvalidShape {
320            rows: sample_indices.len(),
321            cols: self.n_features(),
322        })?;
323
324        Ok(FeatureMatrix {
325            data,
326            feature_names: self.feature_names.clone(),
327        })
328    }
329    
330    /// Creates a submatrix with selected features (columns).
331    /// 
332    /// # Parameters
333    /// - `feature_indices`: Column indices to include in the subset
334    /// 
335    /// # Returns
336    /// New FeatureMatrix containing only selected columns
337    /// 
338    /// # Errors
339    /// - `FeatureMatrixError::FeatureIndexOutOfBounds` if any index >= n_features
340    pub fn select_features(&self, feature_indices: &[usize]) -> std::result::Result<Self, FeatureMatrixError> {
341        let mut selected_data = Vec::new();
342        let mut selected_names = Vec::new();
343        
344        for &idx in feature_indices {
345            if idx >= self.n_features() {
346                return Err(FeatureMatrixError::FeatureIndexOutOfBounds {
347                    index: idx,
348                    max: self.n_features() - 1,
349                });
350            }
351            selected_data.push(self.data.column(idx).to_owned());
352            selected_names.push(self.feature_names[idx].clone());
353        }
354        
355        // Stack columns into a new matrix
356        let data = ndarray::stack(
357            Axis(1),
358            &selected_data.iter().map(|v| v.view()).collect::<Vec<_>>()
359        ).map_err(|_| FeatureMatrixError::InvalidShape {
360            rows: self.n_samples(),
361            cols: feature_indices.len(),
362        })?;
363        
364        Ok(FeatureMatrix {
365            data,
366            feature_names: selected_names,
367        })
368    }
369    
370    /// Loads a feature matrix from a CSV file.
371    /// 
372    /// # Parameters
373    /// - `path`: Path to CSV file with headers
374    /// 
375    /// # Returns
376    /// FeatureMatrix with data and column names
377    /// 
378    /// # Errors
379    /// - `FeatureMatrixError::CsvError` if CSV parsing fails
380    /// - `FeatureMatrixError::IoError` if file cannot be read
381    /// - `FeatureMatrixError::InvalidShape` if file is empty
382    /// 
383    /// # CSV Format
384    /// - First row: column headers (become feature names)
385    /// - Subsequent rows: numeric data (NaN for missing values)
386    pub fn from_csv<P: AsRef<Path>>(path: P) -> std::result::Result<Self, FeatureMatrixError> {
387        let mut reader = csv::ReaderBuilder::new()
388            .has_headers(true)
389            .from_path(path)?;
390            
391        let headers: Vec<String> = reader
392            .headers()?
393            .iter()
394            .map(|s| s.to_string())
395            .collect();
396            
397        let mut data = Vec::new();
398        
399        for result in reader.records() {
400            let record = result?;
401            let row: Vec<f64> = record
402                .iter()
403                .map(|s| s.parse().unwrap_or(f64::NAN))
404                .collect();
405            data.push(row);
406        }
407        
408        let n_samples = data.len();
409        let n_features = headers.len();
410        
411        if n_samples == 0 {
412            return Err(FeatureMatrixError::InvalidShape { rows: 0, cols: n_features });
413        }
414        
415        let flat_data: Vec<f64> = data.into_iter().flatten().collect();
416        let array = Array2::from_shape_vec((n_samples, n_features), flat_data)
417            .map_err(|_| FeatureMatrixError::InvalidShape { rows: n_samples, cols: n_features })?;
418            
419        Ok(FeatureMatrix {
420            data: array,
421            feature_names: headers,
422        })
423    }
424}
425
426/// Equality comparison for feature matrices.
427///
428/// Two matrices are equal if they have:
429/// - Identical underlying data (element-wise)
430/// - Same feature names (in same order)
431impl PartialEq for FeatureMatrix {
432    fn eq(&self, other: &Self) -> bool {
433        self.data == other.data && self.feature_names == other.feature_names
434    }
435}
436