use crate::transforms::Transform;
use scirs2_core::random::Rng;
use std::collections::HashMap;
use std::marker::PhantomData;
use tenflowers_core::{Result, Tensor, TensorError};
/// Transform that expands a flat feature vector into polynomial terms.
///
/// Every output value is the product of the input features named by one index
/// combination; combinations of length `1..=degree` are generated, optionally
/// preceded by a constant bias term.
pub struct PolynomialFeatures<T> {
/// Largest combination length to generate; `new` clamps it to at least 1.
degree: usize,
/// When true, an empty combination (evaluating to one) is emitted first.
include_bias: bool,
/// When true, an index may appear at most once per combination, so only
/// products of distinct features are generated.
interaction_only: bool,
/// Ties the struct to the element type `T` without storing a value of it.
_phantom: PhantomData<T>,
}
impl<T> PolynomialFeatures<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable,
{
    /// Creates a transform producing terms up to `degree` (clamped to at least 1),
    /// with the bias term enabled and repeated indices allowed.
    pub fn new(degree: usize) -> Self {
        Self {
            degree: std::cmp::max(1, degree),
            include_bias: true,
            interaction_only: false,
            _phantom: PhantomData,
        }
    }

    /// Builder-style setter choosing whether the constant bias term is emitted.
    pub fn with_bias(self, include_bias: bool) -> Self {
        Self {
            include_bias,
            ..self
        }
    }

    /// Builder-style switch restricting output to products of distinct features.
    pub fn interaction_only(self) -> Self {
        Self {
            interaction_only: true,
            ..self
        }
    }

    /// Lists every index combination to evaluate for `n_features` inputs.
    ///
    /// The bias (an empty combination) comes first when enabled, followed by all
    /// combinations of length 1, then length 2, and so on up to `self.degree`.
    fn generate_combinations(&self, n_features: usize) -> Vec<Vec<usize>> {
        let mut index_sets: Vec<Vec<usize>> = if self.include_bias {
            vec![Vec::new()]
        } else {
            Vec::new()
        };
        // The recursion pushes and pops symmetrically, so one scratch buffer
        // is empty again after each call and can be shared across orders.
        let mut scratch: Vec<usize> = Vec::new();
        for order in 1..=self.degree {
            self.generate_combinations_recursive(&mut index_sets, &mut scratch, 0, n_features, order);
        }
        index_sets
    }

    /// Depth-first enumeration of non-decreasing index sequences.
    ///
    /// `current` holds the prefix chosen so far; once `remaining_degree` reaches
    /// zero a copy of it is appended to `combinations`. In interaction-only
    /// mode the next index must be strictly greater than the previous one.
    fn generate_combinations_recursive(
        &self,
        combinations: &mut Vec<Vec<usize>>,
        current: &mut Vec<usize>,
        start_idx: usize,
        n_features: usize,
        remaining_degree: usize,
    ) {
        match remaining_degree.checked_sub(1) {
            None => combinations.push(current.clone()),
            Some(still_needed) => {
                // 1 when an index may not repeat, 0 when it may.
                let step = usize::from(self.interaction_only);
                for idx in start_idx..n_features {
                    current.push(idx);
                    self.generate_combinations_recursive(
                        combinations,
                        current,
                        idx + step,
                        n_features,
                        still_needed,
                    );
                    current.pop();
                }
            }
        }
    }

    /// Multiplies together the features named by `combination`.
    ///
    /// An empty combination yields one; indices outside `features` are ignored.
    fn compute_polynomial_feature(&self, features: &[T], combination: &[usize]) -> T {
        combination
            .iter()
            .filter_map(|&idx| features.get(idx))
            .fold(T::one(), |product, &factor| product * factor)
    }
}
impl<T> Transform<T> for PolynomialFeatures<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable,
{
    /// Replaces the sample's features with their polynomial expansion.
    ///
    /// The feature tensor is treated as a flat vector of `shape().size()`
    /// values. The result is a 1-D tensor with one entry per generated
    /// combination; labels are passed through untouched.
    ///
    /// # Errors
    ///
    /// Returns an invalid-argument error when the tensor data cannot be viewed
    /// as a slice, and propagates any error from `Tensor::from_vec`.
    fn apply(&self, sample: (Tensor<T>, Tensor<T>)) -> Result<(Tensor<T>, Tensor<T>)> {
        let (features, labels) = sample;
        let n_features = features.shape().size();
        let feature_data = features.as_slice().ok_or_else(|| {
            TensorError::invalid_argument(
                "Cannot access tensor data (GPU tensor not supported)".to_string(),
            )
        })?;

        let combinations = self.generate_combinations(n_features);
        let polynomial_data: Vec<T> = combinations
            .iter()
            .map(|combination| self.compute_polynomial_feature(feature_data, combination))
            .collect();

        let polynomial_features = Tensor::from_vec(polynomial_data, &[combinations.len()])?;
        Ok((polynomial_features, labels))
    }
}
/// Transform that replaces each feature value with the index of the bin it
/// falls into, using edges computed by `fit` or supplied via `with_edges`.
pub struct BinningTransform<T> {
/// Number of bins to create when fitting; `new` clamps it to at least 2.
n_bins: usize,
/// Rule used by `fit` to place the bin edges.
strategy: BinningStrategy,
/// Bin boundaries; `None` until the transform is fitted or edges are supplied.
bin_edges: Option<Vec<T>>,
/// Ties the struct to the element type `T`.
_phantom: PhantomData<T>,
}
/// How `BinningTransform::fit` positions bin edges between the data minimum
/// and maximum.
#[derive(Debug, Clone)]
pub enum BinningStrategy {
/// Equal-width bins spanning the minimum to the maximum.
Uniform,
/// Interior edges taken from the sorted data at evenly spaced rank positions.
Quantile,
/// Edges at midpoints between centers. NOTE(review): `fit` currently spaces
/// the centers evenly between min and max and runs no iterative clustering.
KMeans,
}
impl<T> BinningTransform<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable,
{
    /// Creates an unfitted transform with `n_bins` bins (clamped to at least 2)
    /// and the given edge-placement strategy.
    pub fn new(n_bins: usize, strategy: BinningStrategy) -> Self {
        Self {
            n_bins: n_bins.max(2),
            strategy,
            bin_edges: None,
            _phantom: PhantomData,
        }
    }

    /// Shorthand for `new(n_bins, BinningStrategy::Uniform)`.
    pub fn uniform(n_bins: usize) -> Self {
        Self::new(n_bins, BinningStrategy::Uniform)
    }

    /// Shorthand for `new(n_bins, BinningStrategy::Quantile)`.
    pub fn quantile(n_bins: usize) -> Self {
        Self::new(n_bins, BinningStrategy::Quantile)
    }

    /// Installs caller-provided bin edges, marking the transform as fitted.
    ///
    /// The edges are stored as given; they are neither sorted nor validated.
    pub fn with_edges(mut self, edges: Vec<T>) -> Self {
        self.bin_edges = Some(edges);
        self
    }

    /// Computes `n_bins + 1` bin edges from `data` according to the strategy.
    ///
    /// # Errors
    ///
    /// Returns an invalid-argument error when `data` is empty.
    ///
    /// # Panics
    ///
    /// Panics if a bin count or index cannot be converted to `T`.
    pub fn fit(&mut self, data: &[T]) -> Result<()> {
        if data.is_empty() {
            return Err(TensorError::invalid_argument(
                "Cannot fit binning transform on empty data".to_string(),
            ));
        }

        // Incomparable pairs (NaN) are treated as equal so sorting never panics.
        let mut sorted_data = data.to_vec();
        sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let min_val = sorted_data[0];
        let max_val = sorted_data[sorted_data.len() - 1];

        let edges = match self.strategy {
            BinningStrategy::Uniform => {
                let step = (max_val - min_val)
                    / T::from(self.n_bins).expect("bin count should convert to float");
                (0..=self.n_bins)
                    .map(|i| {
                        min_val + T::from(i).expect("bin index should convert to float") * step
                    })
                    .collect::<Vec<T>>()
            }
            BinningStrategy::Quantile => {
                let mut edges = Vec::with_capacity(self.n_bins + 1);
                edges.push(min_val);
                for i in 1..self.n_bins {
                    let quantile = i as f64 / self.n_bins as f64;
                    // Truncating cast: picks the sample at or just below the rank.
                    let idx = (quantile * (sorted_data.len() - 1) as f64) as usize;
                    edges.push(sorted_data[idx]);
                }
                edges.push(max_val);
                edges
            }
            BinningStrategy::KMeans => {
                // Centers are spread evenly over [min, max]; `n_bins >= 2` is
                // guaranteed by `new`, so the divisor is never zero.
                let step = (max_val - min_val)
                    / T::from(self.n_bins - 1).expect("bin count should convert to float");
                let centers: Vec<T> = (0..self.n_bins)
                    .map(|i| {
                        min_val + T::from(i).expect("center index should convert to float") * step
                    })
                    .collect();

                let two = T::from(2.0).expect("constant 2.0 should convert to float");
                let mut edges = Vec::with_capacity(self.n_bins + 1);
                edges.push(min_val);
                // Interior edges sit halfway between neighbouring centers.
                edges.extend(centers.windows(2).map(|pair| (pair[0] + pair[1]) / two));
                edges.push(max_val);
                edges
            }
        };

        self.bin_edges = Some(edges);
        Ok(())
    }

    /// Returns the zero-based bin index for `value`.
    ///
    /// A value belongs to the first bin whose upper edge is `>=` the value.
    /// Values above every edge (or comparing false against all of them, such
    /// as NaN) go to the last bin. Returns 0 when no edges are set or when
    /// fewer than two edges were supplied, instead of underflowing.
    fn find_bin(&self, value: T) -> usize {
        let edges = match self.bin_edges.as_ref() {
            Some(edges) => edges,
            None => return 0,
        };
        edges
            .iter()
            .skip(1)
            .position(|&upper| value <= upper)
            // `saturating_sub` guards against edge lists shorter than two
            // entries, which `with_edges` does not forbid.
            .unwrap_or_else(|| edges.len().saturating_sub(2))
    }
}
impl<T> Transform<T> for BinningTransform<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable,
{
    /// Maps every feature value to its bin index, stored as a `T`.
    ///
    /// The output tensor has the same shape as the input; labels pass through.
    ///
    /// # Errors
    ///
    /// Fails when no bin edges are set, when the tensor data cannot be viewed
    /// as a slice, or when `Tensor::from_vec` rejects the result.
    fn apply(&self, sample: (Tensor<T>, Tensor<T>)) -> Result<(Tensor<T>, Tensor<T>)> {
        let (features, labels) = sample;

        match self.bin_edges {
            Some(_) => {}
            None => {
                return Err(TensorError::invalid_argument(
                    "BinningTransform must be fitted before use".to_string(),
                ));
            }
        }

        let values = features.as_slice().ok_or_else(|| {
            TensorError::invalid_argument(
                "Cannot access tensor data (GPU tensor not supported)".to_string(),
            )
        })?;

        let mut bin_indices: Vec<T> = Vec::with_capacity(values.len());
        for &value in values {
            let bin = self.find_bin(value);
            bin_indices.push(T::from(bin).expect("bin index should convert to T"));
        }

        let binned_features = Tensor::from_vec(bin_indices, features.shape().dims())?;
        Ok((binned_features, labels))
    }
}
/// Transform that turns each feature value into a one-hot row over the
/// categories learned by `fit`.
pub struct OneHotEncode<T> {
// categories: learned category values keyed by feature index; only key 0 is
// populated by `fit`. `None` until fitted.
// drop_first: when true, the first category gets no output column and
// encodes as an all-zero row.
categories: Option<HashMap<usize, Vec<T>>>, drop_first: bool,
/// Ties the struct to the element type `T`.
_phantom: PhantomData<T>,
}
impl<T> OneHotEncode<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable
        + std::hash::Hash
        + Eq,
{
    /// Creates an unfitted encoder that keeps a column for every category.
    pub fn new() -> Self {
        Self {
            categories: None,
            drop_first: false,
            _phantom: PhantomData,
        }
    }

    /// Builder-style switch that removes the column of the first category.
    pub fn drop_first(self) -> Self {
        Self {
            drop_first: true,
            ..self
        }
    }

    /// Learns the categories as the sorted, de-duplicated values of `data`.
    ///
    /// The result is stored under feature index 0. This method never fails.
    pub fn fit(&mut self, data: &[T]) -> Result<()> {
        let mut distinct = data.to_vec();
        // Incomparable pairs are treated as equal so sorting never panics.
        distinct.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        distinct.dedup();

        let mut by_feature = HashMap::with_capacity(1);
        by_feature.insert(0, distinct);
        self.categories = Some(by_feature);
        Ok(())
    }

    /// Width of one encoded row.
    ///
    /// This is the number of learned categories, minus one when `drop_first`
    /// is set and at least one category exists. It is 0 before fitting.
    pub fn output_size(&self) -> usize {
        let learned = self
            .categories
            .as_ref()
            .and_then(|by_feature| by_feature.get(&0))
            .map_or(0, |values| values.len());

        if self.drop_first {
            learned.saturating_sub(1)
        } else {
            learned
        }
    }
}
impl<T> Default for OneHotEncode<T>
where
T: Clone
+ Default
+ scirs2_core::numeric::Float
+ Send
+ Sync
+ 'static
+ bytemuck::Pod
+ bytemuck::Zeroable
+ std::hash::Hash
+ Eq,
{
fn default() -> Self {
Self::new()
}
}
impl<T> Transform<T> for OneHotEncode<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable
        + std::hash::Hash
        + Eq,
{
    /// Encodes every feature value as a one-hot row.
    ///
    /// The output tensor has shape `[number of values, output_size()]`. A value
    /// not seen during fitting, or the first category when `drop_first` is
    /// set, produces an all-zero row. Labels pass through untouched.
    ///
    /// # Errors
    ///
    /// Fails when the encoder is not fitted, when the tensor data cannot be
    /// viewed as a slice, or when `Tensor::from_vec` rejects the result.
    fn apply(&self, sample: (Tensor<T>, Tensor<T>)) -> Result<(Tensor<T>, Tensor<T>)> {
        let (features, labels) = sample;

        if self.categories.is_none() {
            return Err(TensorError::invalid_argument(
                "OneHotEncode must be fitted before use".to_string(),
            ));
        }

        let values = features.as_slice().ok_or_else(|| {
            TensorError::invalid_argument(
                "Cannot access tensor data (GPU tensor not supported)".to_string(),
            )
        })?;

        let known = self
            .categories
            .as_ref()
            .expect("categories should be fitted")
            .get(&0)
            .expect("feature index 0 should exist");

        let width = self.output_size();
        // One flat row-major buffer, zero-filled; only the hot cells are written.
        let mut encoded = vec![T::zero(); width * values.len()];

        for (row, &value) in values.iter().enumerate() {
            let column = match known.iter().position(|&category| category == value) {
                // The dropped first category has no column of its own.
                Some(0) if self.drop_first => None,
                // Remaining categories shift left by one when the first is dropped.
                Some(pos) if self.drop_first => Some(pos - 1),
                other => other,
            };
            if let Some(col) = column {
                if col < width {
                    encoded[row * width + col] = T::one();
                }
            }
        }

        let new_shape = vec![values.len(), width];
        let encoded_features = Tensor::from_vec(encoded, &new_shape)?;
        Ok((encoded_features, labels))
    }
}
/// Transform that replaces each category value with the (optionally
/// smoothed) mean target observed for that category during `fit`.
pub struct TargetEncode<T> {
/// Per-category encoded value; `None` until fitted.
category_means: Option<HashMap<T, T>>,
/// Mean of all targets seen in `fit`; used for categories not seen there.
global_mean: Option<T>,
/// Weight given to the global mean when smoothing; `new` clamps it to be
/// non-negative, and 0 disables smoothing.
smoothing: f64,
/// Ties the struct to the element type `T`.
_phantom: PhantomData<T>,
}
impl<T> TargetEncode<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable
        + std::hash::Hash
        + Eq,
{
    /// Creates an unfitted encoder; negative `smoothing` is clamped to 0.
    pub fn new(smoothing: f64) -> Self {
        Self {
            category_means: None,
            global_mean: None,
            smoothing: smoothing.max(0.0),
            _phantom: PhantomData,
        }
    }

    /// Learns the encoded value for every category in `features`.
    ///
    /// For each category the mean of its targets is computed. With a positive
    /// smoothing weight `alpha` and `n` observations, the stored value is
    /// `(category_mean * n + global_mean * alpha) / (n + alpha)`.
    ///
    /// # Errors
    ///
    /// Returns an invalid-argument error when the slices differ in length or
    /// are empty (an empty input would otherwise yield a NaN global mean).
    ///
    /// # Panics
    ///
    /// Panics if a count or the smoothing weight cannot be converted to `T`.
    pub fn fit(&mut self, features: &[T], targets: &[T]) -> Result<()> {
        if features.len() != targets.len() {
            return Err(TensorError::invalid_argument(
                "Features and targets must have the same length".to_string(),
            ));
        }
        if targets.is_empty() {
            return Err(TensorError::invalid_argument(
                "Cannot fit target encoder on empty data".to_string(),
            ));
        }

        let global_mean = targets.iter().fold(T::zero(), |acc, &x| acc + x)
            / T::from(targets.len()).expect("target length should convert to T");

        // Running (sum of targets, observation count) per category.
        let mut category_sums: HashMap<T, (T, usize)> = HashMap::new();
        for (&feature, &target) in features.iter().zip(targets.iter()) {
            let entry = category_sums.entry(feature).or_insert((T::zero(), 0));
            entry.0 = entry.0 + target;
            entry.1 += 1;
        }

        let mut category_means = HashMap::with_capacity(category_sums.len());
        for (category, (sum, count)) in category_sums {
            let n = T::from(count).expect("count should convert to T");
            let category_mean = sum / n;
            let encoded = if self.smoothing > 0.0 {
                let alpha = T::from(self.smoothing).expect("smoothing value should convert to T");
                // Shrink rare categories towards the global mean.
                (category_mean * n + global_mean * alpha) / (n + alpha)
            } else {
                category_mean
            };
            category_means.insert(category, encoded);
        }

        self.global_mean = Some(global_mean);
        self.category_means = Some(category_means);
        Ok(())
    }
}
impl<T> Transform<T> for TargetEncode<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable
        + std::hash::Hash
        + Eq,
{
    /// Replaces each feature value with its learned category encoding.
    ///
    /// Values not seen during fitting are encoded as the global target mean.
    /// The output keeps the input shape; labels pass through untouched.
    ///
    /// # Errors
    ///
    /// Fails when the encoder is not fitted, when the tensor data cannot be
    /// viewed as a slice, or when `Tensor::from_vec` rejects the result.
    fn apply(&self, sample: (Tensor<T>, Tensor<T>)) -> Result<(Tensor<T>, Tensor<T>)> {
        let (features, labels) = sample;

        let fitted = self.category_means.is_some() && self.global_mean.is_some();
        if !fitted {
            return Err(TensorError::invalid_argument(
                "TargetEncode must be fitted before use".to_string(),
            ));
        }

        let values = features.as_slice().ok_or_else(|| {
            TensorError::invalid_argument(
                "Cannot access tensor data (GPU tensor not supported)".to_string(),
            )
        })?;

        let lookup = self
            .category_means
            .as_ref()
            .expect("category_means should be fitted");
        let fallback = self.global_mean.expect("global_mean should be fitted");

        let mut encoded: Vec<T> = Vec::with_capacity(values.len());
        for value in values {
            encoded.push(match lookup.get(value) {
                Some(&mean) => mean,
                None => fallback,
            });
        }

        let encoded_features = Tensor::from_vec(encoded, features.shape().dims())?;
        Ok((encoded_features, labels))
    }
}
/// Feature selector that keeps only the feature columns whose variance,
/// measured during `fit`, is at least `threshold`.
pub struct VarianceThreshold<T> {
/// Minimum variance a feature needs in order to be kept (inclusive).
threshold: T,
/// Indices of the kept features; `None` until fitted.
selected_features: Option<Vec<usize>>,
/// Ties the struct to the element type `T`.
_phantom: PhantomData<T>,
}
impl<T> VarianceThreshold<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable,
{
    /// Creates an unfitted selector with the given inclusive variance threshold.
    pub fn new(threshold: T) -> Self {
        Self {
            threshold,
            selected_features: None,
            _phantom: PhantomData,
        }
    }

    /// Decides which feature columns to keep, based on the rows in `data`.
    ///
    /// The number of features is taken from the first row. For every column
    /// the variance is computed as the sum of squared deviations divided by
    /// the number of rows. Rows too short to contain a column contribute
    /// nothing to its sums, but still count in the divisor.
    ///
    /// # Errors
    ///
    /// Returns an invalid-argument error when `data` or its first row is empty.
    ///
    /// # Panics
    ///
    /// Panics if the row count cannot be converted to `T`.
    pub fn fit(&mut self, data: &[Vec<T>]) -> Result<()> {
        let n_features = match data.first() {
            Some(first_row) if !first_row.is_empty() => first_row.len(),
            _ => {
                return Err(TensorError::invalid_argument(
                    "Cannot fit on empty data".to_string(),
                ));
            }
        };
        let n_samples = T::from(data.len()).expect("data length should convert to T");

        let kept: Vec<usize> = (0..n_features)
            .filter(|&column| {
                let total = data
                    .iter()
                    .filter_map(|row| row.get(column))
                    .fold(T::zero(), |acc, &x| acc + x);
                let mean = total / n_samples;

                let squared_deviation = data
                    .iter()
                    .filter_map(|row| row.get(column))
                    .fold(T::zero(), |acc, &x| {
                        let delta = x - mean;
                        acc + delta * delta
                    });

                squared_deviation / n_samples >= self.threshold
            })
            .collect();

        self.selected_features = Some(kept);
        Ok(())
    }

    /// Returns the indices kept by `fit`, or `None` when not yet fitted.
    pub fn get_selected_features(&self) -> &Option<Vec<usize>> {
        &self.selected_features
    }
}
impl<T> Transform<T> for VarianceThreshold<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable,
{
    /// Keeps only the selected feature positions of the sample.
    ///
    /// The feature tensor is read as a flat slice. The output is a 1-D tensor
    /// with one entry per selected index, in selection order. Labels pass
    /// through untouched.
    ///
    /// # Errors
    ///
    /// Fails when the selector is not fitted, when the tensor data cannot be
    /// viewed as a slice, when a selected index lies outside the sample (the
    /// sample has fewer features than the fitting data), or when
    /// `Tensor::from_vec` rejects the result.
    fn apply(&self, sample: (Tensor<T>, Tensor<T>)) -> Result<(Tensor<T>, Tensor<T>)> {
        let (features, labels) = sample;

        let selected = match self.selected_features.as_ref() {
            Some(selected) => selected,
            None => {
                return Err(TensorError::invalid_argument(
                    "VarianceThreshold must be fitted before use".to_string(),
                ));
            }
        };

        let feature_data = features.as_slice().ok_or_else(|| {
            TensorError::invalid_argument(
                "Cannot access tensor data (GPU tensor not supported)".to_string(),
            )
        })?;

        let mut filtered_data = Vec::with_capacity(selected.len());
        for &idx in selected {
            // Report a missing feature explicitly rather than dropping it,
            // which would leave the data shorter than the declared shape.
            match feature_data.get(idx) {
                Some(&value) => filtered_data.push(value),
                None => {
                    return Err(TensorError::invalid_argument(format!(
                        "Selected feature index {} is out of range for a sample with {} features",
                        idx,
                        feature_data.len()
                    )));
                }
            }
        }

        let filtered_features = Tensor::from_vec(filtered_data, &[selected.len()])?;
        Ok((filtered_features, labels))
    }
}
/// Element-wise power transform (Box-Cox or Yeo-Johnson) applied with the
/// lambda stored by `fit`.
pub struct PowerTransform<T> {
/// Which transform formula to apply.
method: PowerMethod,
/// Lambda parameters; `None` until fitted. `fit` stores a single value and
/// `apply` reads only the first element.
fitted_lambdas: Option<Vec<T>>,
/// Ties the struct to the element type `T`.
_phantom: PhantomData<T>,
}
/// Selects the formula used by `PowerTransform`.
#[derive(Debug, Clone)]
pub enum PowerMethod {
/// Box-Cox transform; `fit` assigns lambda 0.5, and non-positive inputs
/// are mapped to zero by the transform.
BoxCox,
/// Yeo-Johnson transform, defined for negative inputs as well; `fit`
/// assigns lambda 1.
YeoJohnson,
}
impl<T> PowerTransform<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable,
{
    /// Creates an unfitted transform using the given method.
    pub fn new(method: PowerMethod) -> Self {
        Self {
            method,
            fitted_lambdas: None,
            _phantom: PhantomData,
        }
    }

    /// Shorthand for `new(PowerMethod::BoxCox)`.
    pub fn box_cox() -> Self {
        Self::new(PowerMethod::BoxCox)
    }

    /// Shorthand for `new(PowerMethod::YeoJohnson)`.
    pub fn yeo_johnson() -> Self {
        Self::new(PowerMethod::YeoJohnson)
    }

    /// Marks the transform as fitted by storing a fixed lambda for the method:
    /// 0.5 for Box-Cox and 1 for Yeo-Johnson.
    ///
    /// The lambda is not estimated from `data`; the argument is accepted but
    /// not inspected. This method never fails.
    pub fn fit(&mut self, data: &[T]) -> Result<()> {
        // Explicitly discard the input so the parameter does not trigger an
        // unused-variable warning while its name stays stable.
        let _ = data;

        let lambda = match self.method {
            PowerMethod::BoxCox => T::from(0.5).expect("power method default should convert to T"),
            PowerMethod::YeoJohnson => T::one(),
        };
        self.fitted_lambdas = Some(vec![lambda]);
        Ok(())
    }

    /// Box-Cox transform of a single value.
    ///
    /// Non-positive inputs return zero. Otherwise the result is `ln(value)`
    /// when `lambda` is zero and `(value^lambda - 1) / lambda` for any other
    /// lambda.
    fn box_cox_transform(&self, value: T, lambda: T) -> T {
        if value <= T::zero() {
            return T::zero();
        }
        if lambda == T::zero() {
            value.ln()
        } else {
            (value.powf(lambda) - T::one()) / lambda
        }
    }

    /// Yeo-Johnson transform of a single value.
    ///
    /// Non-negative inputs use `ln(value + 1)` when `lambda` is zero and
    /// `((value + 1)^lambda - 1) / lambda` otherwise. Negative inputs use
    /// `-ln(1 - value)` when `lambda` is two and
    /// `-((1 - value)^(2 - lambda) - 1) / (2 - lambda)` otherwise.
    fn yeo_johnson_transform(&self, value: T, lambda: T) -> T {
        let one = T::one();
        let two = T::from(2.0).expect("constant 2.0 should convert to T");

        if value >= T::zero() {
            if lambda == T::zero() {
                (value + one).ln()
            } else {
                ((value + one).powf(lambda) - one) / lambda
            }
        } else if lambda == two {
            -((-value + one).ln())
        } else {
            -((((-value) + one).powf(two - lambda) - one) / (two - lambda))
        }
    }
}
impl<T> Transform<T> for PowerTransform<T>
where
    T: Clone
        + Default
        + scirs2_core::numeric::Float
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + bytemuck::Zeroable,
{
    /// Applies the configured power transform to every feature value.
    ///
    /// Only the first stored lambda is used. The output keeps the input
    /// shape; labels pass through untouched.
    ///
    /// # Errors
    ///
    /// Fails when the transform is not fitted, when the tensor data cannot be
    /// viewed as a slice, or when `Tensor::from_vec` rejects the result.
    fn apply(&self, sample: (Tensor<T>, Tensor<T>)) -> Result<(Tensor<T>, Tensor<T>)> {
        let (features, labels) = sample;

        if self.fitted_lambdas.is_none() {
            return Err(TensorError::invalid_argument(
                "PowerTransform must be fitted before use".to_string(),
            ));
        }

        let values = features.as_slice().ok_or_else(|| {
            TensorError::invalid_argument(
                "Cannot access tensor data (GPU tensor not supported)".to_string(),
            )
        })?;

        let lambda = self
            .fitted_lambdas
            .as_ref()
            .expect("fitted_lambdas should be fitted")[0];

        // Choose the formula once instead of matching for every element.
        let transformed: Vec<T> = match self.method {
            PowerMethod::BoxCox => values
                .iter()
                .map(|&value| self.box_cox_transform(value, lambda))
                .collect(),
            PowerMethod::YeoJohnson => values
                .iter()
                .map(|&value| self.yeo_johnson_transform(value, lambda))
                .collect(),
        };

        let transformed_features = Tensor::from_vec(transformed, features.shape().dims())?;
        Ok((transformed_features, labels))
    }
}