use crate::core::error::{Error, Result};
use crate::series::Series;
use crate::time_series::core::{DateTimeIndex, TimeSeries, TimeSeriesBuilder, TimeSeriesData};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Configurable preprocessing pipeline for a [`TimeSeries`].
///
/// [`TimeSeriesPreprocessor::preprocess`] runs the configured stages in a fixed
/// order: missing-value handling, resampling, outlier handling, smoothing,
/// differencing and finally normalization. Optional stages stored as `None`
/// are skipped.
#[derive(Debug, Clone)]
pub struct TimeSeriesPreprocessor {
/// How missing (absent or non-finite) values are filled or dropped.
pub missing_value_strategy: MissingValueStrategy,
/// Detector used to find outliers; detected points are removed from the series.
pub outlier_detection: OutlierDetection,
/// Optional value rescaling / transformation, applied as the last stage.
pub normalization: Option<Normalization>,
/// Optional regular and seasonal differencing.
pub differencing: Option<Differencing>,
/// Optional smoothing stage.
pub smoothing: Option<SmoothingConfig>,
/// Optional resampling stage, applied right after missing-value handling.
pub resampling: Option<ResamplingConfig>,
}
/// Strategy used by the preprocessor to deal with missing values.
///
/// A value counts as missing when it is absent or not finite (NaN / infinite).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MissingValueStrategy {
/// Remove every point whose value is missing.
DropNA,
/// Delegate to `TimeSeries::fillna_forward`.
ForwardFill,
/// Delegate to `TimeSeries::fillna_backward`.
BackwardFill,
/// Fill each gap from the nearest valid values before and after it.
LinearInterpolation,
/// Currently handled exactly like `LinearInterpolation`; no spline is fitted.
SplineInterpolation,
/// Replace missing values with the mean of the finite values.
FillMean,
/// Replace missing values with the median of the finite values.
FillMedian,
/// Replace missing values with the given constant.
FillConstant(f64),
/// Currently handled as a forward fill; no seasonal logic is applied.
SeasonalFill,
}
/// Outlier detection method used by the preprocessor.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum OutlierDetection {
/// Do not look for outliers.
None,
/// Flag values whose `|x - mean| / std` (population std) exceeds `threshold`.
ZScore { threshold: f64 },
/// Flag values whose `0.6745 * |x - median| / MAD` exceeds `threshold`.
ModifiedZScore { threshold: f64 },
/// Flag values outside `[q1 - multiplier * iqr, q3 + multiplier * iqr]`.
IQR { multiplier: f64 },
/// Currently a stub: the detector for this variant never flags any point.
IsolationForest { contamination: f64 },
/// Compare each point with the mean and population std of the finite values
/// among the `window` points preceding it; flag it when it lies more than
/// `sigma` standard deviations away.
SPC { window: usize, sigma: f64 },
}
/// Ways of treating detected outliers.
///
/// NOTE(review): nothing visible in this file reads this enum — the
/// preprocessor's outlier stage always removes detected points. The variant
/// meanings below are inferred from their names; confirm before relying on them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum OutlierTreatment {
/// Presumably: drop the outlying points.
Remove,
/// Presumably: clamp outliers to a boundary value.
Cap,
/// Presumably: substitute the series median.
ReplaceWithMedian,
/// Presumably: substitute the series mean.
ReplaceWithMean,
/// Presumably: re-estimate the point from its neighbours.
Interpolate,
/// Presumably: clamp to the given percentile.
Winsorize { percentile: f64 },
}
/// Value transformation applied by the normalization stage.
///
/// Only finite values are transformed; points with missing or non-finite
/// values are left out of the normalized series.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Normalization {
/// Linear rescaling to `[0, 1]`.
MinMax,
/// Linear rescaling to `[min, max]`.
MinMaxRange { min: f64, max: f64 },
/// Subtract the mean and divide by the population standard deviation.
ZScore,
/// Subtract the median and divide by the median absolute deviation.
Robust,
/// Divide every value by the Euclidean norm of the series.
UnitVector,
/// Replace every value by its rank divided by `n - 1`.
Quantile,
/// Box-Cox power transform; a `lambda` of `None` is treated as `0`
/// (natural logarithm). Requires strictly positive values.
BoxCox { lambda: Option<f64> },
/// Logarithm in the given base. Requires strictly positive values.
Log { base: f64 },
/// Square root. Requires non-negative values.
Sqrt,
}
/// Differencing configuration.
///
/// Regular (lag-1) differences are applied first. Seasonal differences are
/// applied afterwards, and only when both `seasonal_order` and
/// `seasonal_period` are set.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Differencing {
/// Number of lag-1 differences to apply.
pub order: usize,
/// Number of seasonal differences to apply.
pub seasonal_order: Option<usize>,
/// Lag used for each seasonal difference.
pub seasonal_period: Option<usize>,
}
/// Smoothing stage configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmoothingConfig {
/// Smoothing algorithm and its arguments.
pub method: SmoothingMethod,
/// Free-form parameters; the smoothing stage copies them verbatim into the
/// resulting `TransformationInfo` and does not use them for the computation.
pub parameters: HashMap<String, f64>,
}
/// Smoothing algorithm selector.
///
/// Several variants are placeholders that currently fall back to a simpler
/// smoother, as noted on each variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SmoothingMethod {
/// Rolling mean over `window` points (`TimeSeries::rolling_mean`).
MovingAverage { window: usize },
/// Simple exponential smoothing: `s_i = alpha * x_i + (1 - alpha) * s_{i-1}`.
ExponentialSmoothing { alpha: f64 },
/// Currently a rolling mean over `window`; `order` is ignored.
SavitzkyGolay { window: usize, order: usize },
/// Currently a rolling mean over 5 points; `fraction` is ignored.
Lowess { fraction: f64 },
/// Currently exponential smoothing with `alpha = 0.3`.
KalmanFilter,
/// Currently a rolling mean over 10 points; `lambda` is ignored.
HodrickPrescott { lambda: f64 },
}
/// Resampling stage configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResamplingConfig {
/// Target frequency handed to `TimeSeries::resample`.
pub frequency: crate::time_series::core::Frequency,
/// Requested aggregation.
/// NOTE(review): the resampling stage in this file always resamples with
/// `ResampleMethod::Mean` and does not read this field.
pub aggregation: AggregationMethod,
}
/// Aggregation selector carried by [`ResamplingConfig`].
///
/// NOTE(review): no code visible in this file interprets these variants; the
/// names suggest the usual per-bucket aggregate (mean, median, sum, ...).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AggregationMethod {
Mean,
Median,
Sum,
Min,
Max,
First,
Last,
Std,
Count,
}
/// Everything produced by one run of [`TimeSeriesPreprocessor::preprocess`].
#[derive(Debug, Clone)]
pub struct PreprocessingResult {
/// The series after all configured stages have run.
pub processed_series: TimeSeries,
/// One record per stage that actually changed the series, in execution order.
pub transformations: Vec<TransformationInfo>,
/// Before/after summary statistics.
pub statistics: PreprocessingStatistics,
/// Details about the outlier stage (filled in even when nothing was detected).
pub outlier_info: OutlierInfo,
}
/// Record describing a single preprocessing stage that was applied.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TransformationInfo {
/// Stage label, e.g. `"differencing"` or `"normalization_MinMax"`.
pub transformation_type: String,
/// Numeric parameters of the stage, keyed by name (may be empty).
pub parameters: HashMap<String, f64>,
/// Number of values the stage reports as affected.
pub affected_values: usize,
/// Pipeline position assigned by the stage that created this record.
pub order: usize,
}
/// Summary statistics comparing the input series with the processed one.
///
/// Ranges, means and standard deviations are computed over finite values
/// only; they are all `0.0` when a series has no finite value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PreprocessingStatistics {
/// Number of points in the input series.
pub original_length: usize,
/// Number of points in the processed series.
pub final_length: usize,
/// Number of missing values reported by the missing-value stage (0 if it did not run).
pub missing_values_handled: usize,
/// Number of points flagged by the outlier detector.
pub outliers_detected: usize,
/// `(min, max)` of the input series.
pub original_range: (f64, f64),
/// `(min, max)` of the processed series.
pub final_range: (f64, f64),
/// `(mean of input, mean of processed)`.
pub mean_before_after: (f64, f64),
/// `(population std of input, population std of processed)`.
pub std_before_after: (f64, f64),
}
/// Details about the outliers found by the outlier stage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutlierInfo {
/// Positions of the flagged points in the series that was handed to the detector.
pub outlier_indices: Vec<usize>,
/// Values found at `outlier_indices`.
pub outlier_values: Vec<f64>,
/// `Debug` rendering of the configured `OutlierDetection`.
pub detection_method: String,
/// Name of the treatment applied; the outlier stage always reports `"Remove"`.
pub treatment_method: String,
/// The detector's numeric parameter (threshold, multiplier, contamination or
/// sigma); `0.0` when detection is disabled.
pub threshold: f64,
}
/// The default preprocessor is exactly the one built by [`TimeSeriesPreprocessor::new`].
impl Default for TimeSeriesPreprocessor {
fn default() -> Self {
Self::new()
}
}
impl TimeSeriesPreprocessor {
/// Creates a preprocessor with the baseline configuration.
///
/// Missing values are filled by linear interpolation, outliers are detected
/// with the modified z-score at a threshold of `3.5`, and every optional
/// stage (normalization, differencing, smoothing, resampling) is disabled.
pub fn new() -> Self {
    let missing_value_strategy = MissingValueStrategy::LinearInterpolation;
    let outlier_detection = OutlierDetection::ModifiedZScore { threshold: 3.5 };
    Self {
        missing_value_strategy,
        outlier_detection,
        normalization: None,
        differencing: None,
        smoothing: None,
        resampling: None,
    }
}
pub fn with_missing_value_strategy(mut self, strategy: MissingValueStrategy) -> Self {
self.missing_value_strategy = strategy;
self
}
pub fn with_outlier_detection(mut self, detection: OutlierDetection) -> Self {
self.outlier_detection = detection;
self
}
pub fn with_normalization(mut self, normalization: Normalization) -> Self {
self.normalization = Some(normalization);
self
}
pub fn with_differencing(mut self, differencing: Differencing) -> Self {
self.differencing = Some(differencing);
self
}
pub fn with_smoothing(mut self, smoothing: SmoothingConfig) -> Self {
self.smoothing = Some(smoothing);
self
}
pub fn with_resampling(mut self, resampling: ResamplingConfig) -> Self {
self.resampling = Some(resampling);
self
}
pub fn preprocess(&self, ts: &TimeSeries) -> Result<PreprocessingResult> {
let mut processed_series = ts.clone();
let mut transformations = Vec::new();
let original_stats = self.calculate_basic_stats(&processed_series)?;
let original_length = processed_series.len();
let (series_after_missing, missing_transform) =
self.handle_missing_values(&processed_series)?;
processed_series = series_after_missing;
if let Some(transform) = missing_transform {
transformations.push(transform);
}
if let Some(resampling_config) = &self.resampling {
let (resampled_series, resample_transform) =
self.resample_series(&processed_series, resampling_config)?;
processed_series = resampled_series;
transformations.push(resample_transform);
}
let (series_after_outliers, outlier_transform, outlier_detection_info) =
self.handle_outliers(&processed_series)?;
processed_series = series_after_outliers;
if let Some(transform) = outlier_transform {
transformations.push(transform);
}
let outlier_info = outlier_detection_info;
if let Some(smoothing_config) = &self.smoothing {
let (smoothed_series, smooth_transform) =
self.apply_smoothing(&processed_series, smoothing_config)?;
processed_series = smoothed_series;
transformations.push(smooth_transform);
}
if let Some(differencing_config) = &self.differencing {
let (differenced_series, diff_transform) =
self.apply_differencing(&processed_series, differencing_config)?;
processed_series = differenced_series;
transformations.push(diff_transform);
}
if let Some(normalization_method) = &self.normalization {
let (normalized_series, norm_transform) =
self.apply_normalization(&processed_series, normalization_method)?;
processed_series = normalized_series;
transformations.push(norm_transform);
}
let final_stats = self.calculate_basic_stats(&processed_series)?;
let final_length = processed_series.len();
let statistics = PreprocessingStatistics {
original_length,
final_length,
missing_values_handled: transformations
.iter()
.find(|t| t.transformation_type.contains("missing"))
.map(|t| t.affected_values)
.unwrap_or(0),
outliers_detected: outlier_info.outlier_indices.len(),
original_range: (original_stats.0, original_stats.1),
final_range: (final_stats.0, final_stats.1),
mean_before_after: (original_stats.2, final_stats.2),
std_before_after: (original_stats.3, final_stats.3),
};
Ok(PreprocessingResult {
processed_series,
transformations,
statistics,
outlier_info,
})
}
/// Applies the configured missing-value strategy.
///
/// Returns the (possibly unchanged) series plus a transformation record; the
/// record is `None` when the series has no missing values, in which case the
/// series is returned as a plain clone.
fn handle_missing_values(
    &self,
    ts: &TimeSeries,
) -> Result<(TimeSeries, Option<TransformationInfo>)> {
    let affected_values = self.count_missing_values(ts);
    if affected_values == 0 {
        return Ok((ts.clone(), None));
    }

    let strategy = &self.missing_value_strategy;
    let repaired = match strategy {
        MissingValueStrategy::DropNA => self.drop_missing_values(ts)?,
        MissingValueStrategy::ForwardFill => ts.fillna_forward()?,
        MissingValueStrategy::BackwardFill => ts.fillna_backward()?,
        MissingValueStrategy::LinearInterpolation => self.linear_interpolation(ts)?,
        MissingValueStrategy::SplineInterpolation => self.spline_interpolation(ts)?,
        MissingValueStrategy::FillMean => self.fill_with_mean(ts)?,
        MissingValueStrategy::FillMedian => self.fill_with_median(ts)?,
        MissingValueStrategy::FillConstant(constant) => self.fill_with_constant(ts, *constant)?,
        MissingValueStrategy::SeasonalFill => self.seasonal_fill(ts)?,
    };

    let record = TransformationInfo {
        transformation_type: format!("missing_value_{:?}", strategy),
        parameters: HashMap::new(),
        affected_values,
        order: 1,
    };
    Ok((repaired, Some(record)))
}
/// Detects outliers with the configured method and removes them.
///
/// Returns the cleaned series, a transformation record (`None` when nothing
/// was detected) and an [`OutlierInfo`] that is always populated.
///
/// # Errors
///
/// Propagates detector errors and the error raised by `remove_outliers`
/// when every point would be removed.
fn handle_outliers(
    &self,
    ts: &TimeSeries,
) -> Result<(TimeSeries, Option<TransformationInfo>, OutlierInfo)> {
    let outlier_indices = match &self.outlier_detection {
        OutlierDetection::None => Ok(Vec::new()),
        OutlierDetection::ZScore { threshold } => self.detect_outliers_zscore(ts, *threshold),
        OutlierDetection::ModifiedZScore { threshold } => {
            self.detect_outliers_modified_zscore(ts, *threshold)
        }
        OutlierDetection::IQR { multiplier } => self.detect_outliers_iqr(ts, *multiplier),
        OutlierDetection::IsolationForest { contamination } => {
            self.detect_outliers_isolation_forest(ts, *contamination)
        }
        OutlierDetection::SPC { window, sigma } => self.detect_outliers_spc(ts, *window, *sigma),
    }?;

    let outlier_values: Vec<f64> = outlier_indices
        .iter()
        .filter_map(|&idx| ts.values.get_f64(idx))
        .collect();
    let detected = outlier_indices.len();

    // Remove before the indices are moved into `OutlierInfo`, so no clone is needed.
    let cleaned = if detected == 0 {
        None
    } else {
        Some(self.remove_outliers(ts, &outlier_indices)?)
    };

    let outlier_info = OutlierInfo {
        outlier_indices,
        outlier_values,
        detection_method: format!("{:?}", self.outlier_detection),
        treatment_method: "Remove".to_string(),
        threshold: self.get_outlier_threshold(),
    };

    match cleaned {
        None => Ok((ts.clone(), None, outlier_info)),
        Some(processed_series) => {
            let transform_info = TransformationInfo {
                transformation_type: "outlier_removal".to_string(),
                parameters: HashMap::new(),
                affected_values: detected,
                // Outlier handling is the third pipeline stage (after missing
                // values = 1 and resampling = 2); smoothing follows with 4.
                order: 3,
            };
            Ok((processed_series, Some(transform_info), outlier_info))
        }
    }
}
/// Normalizes the finite values of `ts` with `method`.
///
/// Points whose value is absent or non-finite are left out of the output.
/// The returned record carries the method's numeric parameters (if any) and
/// the number of values that were normalized.
///
/// # Errors
///
/// Propagates errors from the chosen transform and from `TimeSeries::new`.
fn apply_normalization(
    &self,
    ts: &TimeSeries,
    method: &Normalization,
) -> Result<(TimeSeries, TransformationInfo)> {
    let mut finite_values = Vec::new();
    let mut kept_timestamps = Vec::new();
    for position in 0..ts.len() {
        match ts.values.get_f64(position) {
            Some(value) if value.is_finite() => {
                finite_values.push(value);
                if let Some(stamp) = ts.index.get(position) {
                    kept_timestamps.push(*stamp);
                }
            }
            _ => {}
        }
    }

    let transformed = match method {
        Normalization::MinMax => self.minmax_normalize(&finite_values, 0.0, 1.0),
        Normalization::MinMaxRange { min, max } => {
            self.minmax_normalize(&finite_values, *min, *max)
        }
        Normalization::ZScore => self.zscore_normalize(&finite_values),
        Normalization::Robust => self.robust_normalize(&finite_values),
        Normalization::UnitVector => self.unit_vector_normalize(&finite_values),
        Normalization::Quantile => self.quantile_normalize(&finite_values),
        Normalization::BoxCox { lambda } => self.boxcox_transform(&finite_values, *lambda),
        Normalization::Log { base } => self.log_transform(&finite_values, *base),
        Normalization::Sqrt => self.sqrt_transform(&finite_values),
    }?;

    let processed_ts = TimeSeries::new(
        DateTimeIndex::new(kept_timestamps),
        TimeSeriesData::from_vec(transformed),
    )?;

    // Only parameterized methods contribute entries to the record.
    let mut parameters = HashMap::new();
    match method {
        Normalization::MinMaxRange { min, max } => {
            parameters.insert("min".to_string(), *min);
            parameters.insert("max".to_string(), *max);
        }
        Normalization::Log { base } => {
            parameters.insert("base".to_string(), *base);
        }
        Normalization::BoxCox { lambda: Some(l) } => {
            parameters.insert("lambda".to_string(), *l);
        }
        _ => {}
    }

    let record = TransformationInfo {
        transformation_type: format!("normalization_{:?}", method),
        parameters,
        affected_values: finite_values.len(),
        order: 6,
    };
    Ok((processed_ts, record))
}
/// Applies `config.order` lag-1 differences, then — when both seasonal
/// fields are set — `seasonal_order` differences at lag `seasonal_period`.
///
/// # Errors
///
/// Propagates the first error returned by `TimeSeries::diff`.
fn apply_differencing(
    &self,
    ts: &TimeSeries,
    config: &Differencing,
) -> Result<(TimeSeries, TransformationInfo)> {
    // Seasonal passes only exist when both the count and the period are given.
    let seasonal_lags = match (config.seasonal_order, config.seasonal_period) {
        (Some(times), Some(period)) => std::iter::repeat(period).take(times),
        _ => std::iter::repeat(0).take(0),
    };
    let lags = std::iter::repeat(1).take(config.order).chain(seasonal_lags);

    let mut differenced = ts.clone();
    for lag in lags {
        differenced = differenced.diff(lag)?;
    }

    let mut parameters = HashMap::new();
    parameters.insert("order".to_string(), config.order as f64);
    let optional_entries = [
        ("seasonal_order", config.seasonal_order),
        ("seasonal_period", config.seasonal_period),
    ];
    for (key, maybe_value) in optional_entries.iter() {
        if let Some(value) = maybe_value {
            parameters.insert(key.to_string(), *value as f64);
        }
    }

    let record = TransformationInfo {
        transformation_type: "differencing".to_string(),
        parameters,
        affected_values: ts.len(),
        order: 5,
    };
    Ok((differenced, record))
}
/// Smooths `ts` with the method selected in `config`.
///
/// The record's parameters are a copy of `config.parameters`; they are not
/// consulted by the smoothing itself.
fn apply_smoothing(
    &self,
    ts: &TimeSeries,
    config: &SmoothingConfig,
) -> Result<(TimeSeries, TransformationInfo)> {
    let method = &config.method;
    let smoothed = match method {
        SmoothingMethod::MovingAverage { window } => self.moving_average_smooth(ts, *window),
        SmoothingMethod::ExponentialSmoothing { alpha } => self.exponential_smooth(ts, *alpha),
        SmoothingMethod::SavitzkyGolay { window, order } => {
            self.savitzky_golay_smooth(ts, *window, *order)
        }
        SmoothingMethod::Lowess { fraction } => self.lowess_smooth(ts, *fraction),
        SmoothingMethod::KalmanFilter => self.kalman_smooth(ts),
        SmoothingMethod::HodrickPrescott { lambda } => self.hodrick_prescott_smooth(ts, *lambda),
    }?;

    let record = TransformationInfo {
        transformation_type: format!("smoothing_{:?}", method),
        parameters: config.parameters.clone(),
        affected_values: ts.len(),
        order: 4,
    };
    Ok((smoothed, record))
}
/// Resamples `ts` to `config.frequency`.
///
/// NOTE(review): the aggregation is hard-wired to `ResampleMethod::Mean`;
/// `config.aggregation` is not consulted, and the recorded `"frequency"`
/// parameter is always `0.0`.
fn resample_series(
    &self,
    ts: &TimeSeries,
    config: &ResamplingConfig,
) -> Result<(TimeSeries, TransformationInfo)> {
    let target_frequency = config.frequency.clone();
    let method = crate::time_series::core::ResampleMethod::Mean;
    let resampled = ts.resample(target_frequency, method)?;

    let mut parameters = HashMap::new();
    parameters.insert("frequency".to_string(), 0.0);

    Ok((
        resampled,
        TransformationInfo {
            transformation_type: "resampling".to_string(),
            parameters,
            affected_values: ts.len(),
            order: 2,
        },
    ))
}
/// Counts the positions whose value is absent or not finite.
fn count_missing_values(&self, ts: &TimeSeries) -> usize {
    (0..ts.len())
        .filter(|&position| match ts.values.get_f64(position) {
            Some(value) => !value.is_finite(),
            // An absent value is treated like NaN, i.e. as missing.
            None => true,
        })
        .count()
}
/// Builds a new series containing only the points with a finite value.
///
/// # Errors
///
/// Returns `Error::InvalidInput` when no finite value exists, and propagates
/// errors from `TimeSeries::from_vecs`.
fn drop_missing_values(&self, ts: &TimeSeries) -> Result<TimeSeries> {
    let kept: Vec<usize> = (0..ts.len())
        .filter(|&position| {
            ts.values
                .get_f64(position)
                .map_or(false, |value| value.is_finite())
        })
        .collect();

    if kept.is_empty() {
        return Err(Error::InvalidInput(
            "No valid values remaining after dropping NAs".to_string(),
        ));
    }

    let mut timestamps = Vec::with_capacity(kept.len());
    let mut values: Vec<f64> = Vec::with_capacity(kept.len());
    for &position in &kept {
        if let Some(stamp) = ts.index.get(position) {
            timestamps.push(stamp.clone());
        }
        if let Some(value) = ts.values.get_f64(position) {
            values.push(value);
        }
    }
    TimeSeries::from_vecs(timestamps, values)
}
/// Fills missing values by linear interpolation over index position.
///
/// A value is missing when it is absent or not finite. Each missing point
/// between two valid neighbours is placed on the straight line joining
/// them, weighted by its distance (in positions) from each neighbour, so a
/// multi-point gap is filled with a ramp rather than a flat midpoint.
/// Leading gaps take the first valid value, trailing gaps take the last
/// valid value, and a series without any valid value is filled with `0.0`.
///
/// Runs in two linear passes instead of rescanning the series for every gap.
///
/// # Errors
///
/// Propagates errors from `TimeSeries::new`.
fn linear_interpolation(&self, ts: &TimeSeries) -> Result<TimeSeries> {
    let len = ts.len();
    let observed: Vec<Option<f64>> = (0..len)
        .map(|i| ts.values.get_f64(i).filter(|v| v.is_finite()))
        .collect();

    // Backward pass: for every position, the nearest valid point at or after it.
    let mut next_anchor: Vec<Option<(usize, f64)>> = vec![None; len];
    let mut upcoming: Option<(usize, f64)> = None;
    for i in (0..len).rev() {
        if let Some(value) = observed[i] {
            upcoming = Some((i, value));
        }
        next_anchor[i] = upcoming;
    }

    // Forward pass: track the last valid point and fill gaps as we go.
    let mut previous: Option<(usize, f64)> = None;
    let mut interpolated_values = Vec::with_capacity(len);
    for i in 0..len {
        let filled = match observed[i] {
            Some(value) => {
                previous = Some((i, value));
                value
            }
            None => match (previous, next_anchor[i]) {
                (Some((left_idx, left)), Some((right_idx, right))) => {
                    let fraction = (i - left_idx) as f64 / (right_idx - left_idx) as f64;
                    left + (right - left) * fraction
                }
                (Some((_, left)), None) => left,
                (None, Some((_, right))) => right,
                (None, None) => 0.0,
            },
        };
        interpolated_values.push(filled);
    }

    let interpolated_series = TimeSeriesData::from_vec(interpolated_values);
    TimeSeries::new(ts.index.clone(), interpolated_series)
}
/// Placeholder for spline interpolation.
///
/// No spline is fitted: the call is forwarded unchanged to
/// `linear_interpolation`, whose result is returned as-is.
fn spline_interpolation(&self, ts: &TimeSeries) -> Result<TimeSeries> {
self.linear_interpolation(ts) }
/// Replaces missing values with the arithmetic mean of the finite values.
///
/// # Errors
///
/// Returns `Error::InvalidInput` when the series has no finite value.
fn fill_with_mean(&self, ts: &TimeSeries) -> Result<TimeSeries> {
    let mut finite: Vec<f64> = Vec::new();
    for position in 0..ts.len() {
        if let Some(value) = ts.values.get_f64(position) {
            if value.is_finite() {
                finite.push(value);
            }
        }
    }

    if finite.is_empty() {
        return Err(Error::InvalidInput(
            "No valid values to calculate mean".to_string(),
        ));
    }

    let total = finite.iter().sum::<f64>();
    self.fill_with_constant(ts, total / finite.len() as f64)
}
/// Replaces missing values with the median of the finite values.
///
/// For an even number of finite values the median is the mean of the two
/// middle ones.
///
/// # Errors
///
/// Returns `Error::InvalidInput` when the series has no finite value.
fn fill_with_median(&self, ts: &TimeSeries) -> Result<TimeSeries> {
    let mut ordered: Vec<f64> = (0..ts.len())
        .filter_map(|position| ts.values.get_f64(position))
        .filter(|value| value.is_finite())
        .collect();

    if ordered.is_empty() {
        return Err(Error::InvalidInput(
            "No valid values to calculate median".to_string(),
        ));
    }

    ordered.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let middle = ordered.len() / 2;
    let median = match ordered.len() % 2 {
        0 => (ordered[middle - 1] + ordered[middle]) / 2.0,
        _ => ordered[middle],
    };
    self.fill_with_constant(ts, median)
}
/// Replaces every absent or non-finite value with `value`, keeping the
/// original index.
fn fill_with_constant(&self, ts: &TimeSeries, value: f64) -> Result<TimeSeries> {
    let mut filled = Vec::with_capacity(ts.len());
    for position in 0..ts.len() {
        let entry = ts
            .values
            .get_f64(position)
            .filter(|candidate| candidate.is_finite())
            .unwrap_or(value);
        filled.push(entry);
    }
    TimeSeries::new(ts.index.clone(), TimeSeriesData::from_vec(filled))
}
/// Placeholder for seasonal filling.
///
/// No seasonal pattern is used: the call is forwarded to
/// `TimeSeries::fillna_forward` and its result returned as-is.
fn seasonal_fill(&self, ts: &TimeSeries) -> Result<TimeSeries> {
ts.fillna_forward()
}
/// Returns the positions whose z-score exceeds `threshold`.
///
/// Mean and (population) standard deviation are computed over the finite
/// values only. When the standard deviation is not positive, every score is
/// taken to be `0.0`.
fn detect_outliers_zscore(&self, ts: &TimeSeries, threshold: f64) -> Result<Vec<usize>> {
    let finite: Vec<f64> = (0..ts.len())
        .filter_map(|position| ts.values.get_f64(position))
        .filter(|value| value.is_finite())
        .collect();

    let count = finite.len() as f64;
    let mean = finite.iter().sum::<f64>() / count;
    let std = (finite.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / count).sqrt();

    let score = |value: f64| {
        if std > 0.0 {
            (value - mean).abs() / std
        } else {
            0.0
        }
    };

    let flagged = (0..ts.len())
        .filter(|&position| {
            ts.values
                .get_f64(position)
                .map_or(false, |value| value.is_finite() && score(value) > threshold)
        })
        .collect();
    Ok(flagged)
}
/// Returns the positions whose modified z-score exceeds `threshold`.
///
/// The modified z-score is `0.6745 * |x - median| / MAD`, where the median
/// and the median absolute deviation (MAD) are computed over the finite
/// values. When the MAD is not positive every score is taken to be `0.0`.
///
/// A series without any finite value yields an empty result instead of
/// panicking while computing the median.
fn detect_outliers_modified_zscore(
    &self,
    ts: &TimeSeries,
    threshold: f64,
) -> Result<Vec<usize>> {
    /// Median of a non-empty, ascending slice.
    fn median_of_sorted(sorted: &[f64]) -> f64 {
        let mid = sorted.len() / 2;
        if sorted.len() % 2 == 0 {
            (sorted[mid - 1] + sorted[mid]) / 2.0
        } else {
            sorted[mid]
        }
    }

    let values: Vec<f64> = (0..ts.len())
        .filter_map(|i| ts.values.get_f64(i))
        .filter(|v| v.is_finite())
        .collect();

    // Nothing finite means nothing can be an outlier; also guards the
    // median computation below against an empty slice.
    if values.is_empty() {
        return Ok(Vec::new());
    }

    let mut sorted = values.clone();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let median = median_of_sorted(&sorted);

    let mut sorted_deviations: Vec<f64> = values.iter().map(|&x| (x - median).abs()).collect();
    sorted_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let mad = median_of_sorted(&sorted_deviations);

    let mut outliers = Vec::new();
    for i in 0..ts.len() {
        if let Some(val) = ts.values.get_f64(i) {
            if val.is_finite() {
                let modified_z = if mad > 0.0 {
                    0.6745 * (val - median).abs() / mad
                } else {
                    0.0
                };
                if modified_z > threshold {
                    outliers.push(i);
                }
            }
        }
    }
    Ok(outliers)
}
/// Returns the positions lying outside the interquartile fences.
///
/// With the finite values sorted ascending and `n` their count, `q1` is the
/// element at `n / 4` and `q3` the element at `3 * n / 4`. A finite value is
/// flagged when it is below `q1 - multiplier * iqr` or above
/// `q3 + multiplier * iqr`.
///
/// A series without any finite value yields an empty result instead of
/// panicking on the quartile lookup.
fn detect_outliers_iqr(&self, ts: &TimeSeries, multiplier: f64) -> Result<Vec<usize>> {
    let mut sorted: Vec<f64> = (0..ts.len())
        .filter_map(|i| ts.values.get_f64(i))
        .filter(|v| v.is_finite())
        .collect();

    let n = sorted.len();
    if n == 0 {
        return Ok(Vec::new());
    }
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let q1 = sorted[n / 4];
    let q3 = sorted[3 * n / 4];
    let iqr = q3 - q1;
    let lower_bound = q1 - multiplier * iqr;
    let upper_bound = q3 + multiplier * iqr;

    let outliers = (0..ts.len())
        .filter(|&i| {
            ts.values.get_f64(i).map_or(false, |val| {
                val.is_finite() && (val < lower_bound || val > upper_bound)
            })
        })
        .collect();
    Ok(outliers)
}
/// Stub for isolation-forest outlier detection.
///
/// Both arguments are ignored and an empty list is always returned, so
/// selecting this detector never flags any point.
fn detect_outliers_isolation_forest(
&self,
_ts: &TimeSeries,
_contamination: f64,
) -> Result<Vec<usize>> {
Ok(Vec::new())
}
/// Rolling control-chart style detection.
///
/// Every position from `window` onwards is compared with the mean and
/// population standard deviation of the finite values among the `window`
/// positions before it. Positions with fewer than three finite predecessors
/// are skipped. A finite value is flagged when it lies more than `sigma`
/// standard deviations from that mean; a non-positive deviation gives a
/// score of `0.0`.
fn detect_outliers_spc(
    &self,
    ts: &TimeSeries,
    window: usize,
    sigma: f64,
) -> Result<Vec<usize>> {
    let mut flagged = Vec::new();

    for current in window..ts.len() {
        // `current >= window` here, so the subtraction cannot underflow.
        let history: Vec<f64> = ((current - window)..current)
            .filter_map(|j| ts.values.get_f64(j))
            .filter(|v| v.is_finite())
            .collect();
        if history.len() < 3 {
            continue;
        }

        let count = history.len() as f64;
        let mean = history.iter().sum::<f64>() / count;
        let std = (history.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / count).sqrt();

        let value = match ts.values.get_f64(current) {
            Some(v) if v.is_finite() => v,
            _ => continue,
        };

        let deviation = if std > 0.0 {
            (value - mean).abs() / std
        } else {
            0.0
        };
        if deviation > sigma {
            flagged.push(current);
        }
    }

    Ok(flagged)
}
/// Builds a new series without the points at `outlier_indices`.
///
/// The indices are loaded into a hash set once, so the membership test is
/// O(1) per point instead of a linear scan of `outlier_indices` for every
/// position in the series.
///
/// # Errors
///
/// Returns `Error::InvalidInput` when every point would be removed, and
/// propagates errors from `TimeSeries::from_vecs`.
fn remove_outliers(&self, ts: &TimeSeries, outlier_indices: &[usize]) -> Result<TimeSeries> {
    let excluded: std::collections::HashSet<usize> = outlier_indices.iter().copied().collect();
    let valid_indices: Vec<usize> = (0..ts.len()).filter(|i| !excluded.contains(i)).collect();

    if valid_indices.is_empty() {
        return Err(Error::InvalidInput(
            "No valid values remaining after outlier removal".to_string(),
        ));
    }

    let new_timestamps: Vec<_> = valid_indices
        .iter()
        .filter_map(|&i| ts.index.get(i))
        .cloned()
        .collect();
    let new_values: Vec<f64> = valid_indices
        .iter()
        .filter_map(|&i| ts.values.get_f64(i))
        .collect();
    TimeSeries::from_vecs(new_timestamps, new_values)
}
/// Returns the single numeric parameter of the configured detector
/// (threshold, multiplier, contamination or sigma), or `0.0` when detection
/// is disabled.
fn get_outlier_threshold(&self) -> f64 {
    match &self.outlier_detection {
        OutlierDetection::None => 0.0,
        OutlierDetection::ZScore { threshold }
        | OutlierDetection::ModifiedZScore { threshold } => *threshold,
        OutlierDetection::IQR { multiplier } => *multiplier,
        OutlierDetection::IsolationForest { contamination } => *contamination,
        OutlierDetection::SPC { sigma, .. } => *sigma,
    }
}
/// Linearly rescales `values` so that their minimum maps to `target_min`
/// and their maximum to `target_max`.
///
/// When all values are equal, every output is `target_min`.
fn minmax_normalize(
    &self,
    values: &[f64],
    target_min: f64,
    target_max: f64,
) -> Result<Vec<f64>> {
    let (lowest, highest) = values.iter().fold(
        (f64::INFINITY, f64::NEG_INFINITY),
        |(lo, hi), &value| (f64::min(lo, value), f64::max(hi, value)),
    );

    if lowest == highest {
        return Ok(vec![target_min; values.len()]);
    }

    let source_span = highest - lowest;
    let target_span = target_max - target_min;
    let mut scaled = Vec::with_capacity(values.len());
    for &value in values {
        scaled.push(target_min + (value - lowest) / source_span * target_span);
    }
    Ok(scaled)
}
/// Standardizes `values` to zero mean and unit (population) standard
/// deviation. A zero deviation yields all zeros.
fn zscore_normalize(&self, values: &[f64]) -> Result<Vec<f64>> {
    let count = values.len() as f64;
    let mean = values.iter().sum::<f64>() / count;
    let std = (values.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / count).sqrt();

    let standardized = if std == 0.0 {
        vec![0.0; values.len()]
    } else {
        values.iter().map(|&x| (x - mean) / std).collect()
    };
    Ok(standardized)
}
/// Centers `values` on their median and scales by the median absolute
/// deviation (MAD): `(x - median) / MAD`.
///
/// A zero MAD yields all zeros. An empty slice yields an empty vector
/// instead of panicking while computing the median.
fn robust_normalize(&self, values: &[f64]) -> Result<Vec<f64>> {
    /// Median of a non-empty, ascending slice.
    fn median_of_sorted(sorted: &[f64]) -> f64 {
        let mid = sorted.len() / 2;
        if sorted.len() % 2 == 0 {
            (sorted[mid - 1] + sorted[mid]) / 2.0
        } else {
            sorted[mid]
        }
    }

    if values.is_empty() {
        return Ok(Vec::new());
    }

    let mut sorted = values.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let median = median_of_sorted(&sorted);

    let mut sorted_deviations: Vec<f64> = values.iter().map(|&x| (x - median).abs()).collect();
    sorted_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let mad = median_of_sorted(&sorted_deviations);

    if mad == 0.0 {
        return Ok(vec![0.0; values.len()]);
    }
    Ok(values.iter().map(|&x| (x - median) / mad).collect())
}
/// Divides every value by the Euclidean norm of the whole slice.
/// A zero norm yields all zeros.
fn unit_vector_normalize(&self, values: &[f64]) -> Result<Vec<f64>> {
    let squared_sum = values.iter().map(|x| x * x).sum::<f64>();
    let length = squared_sum.sqrt();

    let scaled = if length == 0.0 {
        vec![0.0; values.len()]
    } else {
        values.iter().map(|&x| x / length).collect()
    };
    Ok(scaled)
}
/// Replaces every value by its rank scaled to `[0, 1]`: the smallest value
/// maps to `0.0` and the largest to `1.0` (`rank / (n - 1)`).
///
/// Ties keep their input order (the sort is stable). With fewer than two
/// values there is no spread to rank over, so the output is all zeros;
/// this avoids the `0 / 0 = NaN` the formula gives for a single value.
fn quantile_normalize(&self, values: &[f64]) -> Result<Vec<f64>> {
    let count = values.len();
    if count <= 1 {
        return Ok(vec![0.0; count]);
    }

    // Sort positions by value; a stable sort preserves input order among ties.
    let mut by_value: Vec<usize> = (0..count).collect();
    by_value.sort_by(|&a, &b| {
        values[a]
            .partial_cmp(&values[b])
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    let denominator = count as f64 - 1.0;
    let mut normalized = vec![0.0; count];
    for (rank, &original_idx) in by_value.iter().enumerate() {
        normalized[original_idx] = rank as f64 / denominator;
    }
    Ok(normalized)
}
/// Box-Cox power transform.
///
/// A `lambda` of `None` or `0.0` selects the natural logarithm; otherwise
/// each value becomes `(x^lambda - 1) / lambda`.
///
/// # Errors
///
/// Returns `Error::InvalidInput` when any value is zero or negative.
fn boxcox_transform(&self, values: &[f64], lambda: Option<f64>) -> Result<Vec<f64>> {
    let has_non_positive = values.iter().any(|&x| x <= 0.0);
    if has_non_positive {
        return Err(Error::InvalidInput(
            "Box-Cox transformation requires positive values".to_string(),
        ));
    }

    let transformed = match lambda {
        Some(power) if power != 0.0 => values
            .iter()
            .map(|&x| (x.powf(power) - 1.0) / power)
            .collect(),
        _ => values.iter().map(|&x| x.ln()).collect(),
    };
    Ok(transformed)
}
/// Takes the logarithm of every value in the given `base`.
///
/// # Errors
///
/// Returns `Error::InvalidInput` when any value is zero or negative, or when
/// `base` is not a finite positive number different from 1 (such a base has
/// no usable logarithm and would silently yield NaN or infinite results).
fn log_transform(&self, values: &[f64], base: f64) -> Result<Vec<f64>> {
    if values.iter().any(|&x| x <= 0.0) {
        return Err(Error::InvalidInput(
            "Log transformation requires positive values".to_string(),
        ));
    }
    if !base.is_finite() || base <= 0.0 || base == 1.0 {
        return Err(Error::InvalidInput(
            "Log transformation requires a finite positive base other than 1".to_string(),
        ));
    }
    // Change of base: log_b(x) = ln(x) / ln(b).
    let log_base = base.ln();
    Ok(values.iter().map(|&x| x.ln() / log_base).collect())
}
/// Takes the square root of every value.
///
/// # Errors
///
/// Returns `Error::InvalidInput` when any value is negative.
fn sqrt_transform(&self, values: &[f64]) -> Result<Vec<f64>> {
    let has_negative = values.iter().any(|&x| x < 0.0);
    if has_negative {
        return Err(Error::InvalidInput(
            "Square root transformation requires non-negative values".to_string(),
        ));
    }

    let mut roots = Vec::with_capacity(values.len());
    roots.extend(values.iter().map(|x| x.sqrt()));
    Ok(roots)
}
/// Moving-average smoothing: forwards to `TimeSeries::rolling_mean` with the
/// given `window` and returns its result unchanged.
fn moving_average_smooth(&self, ts: &TimeSeries, window: usize) -> Result<TimeSeries> {
ts.rolling_mean(window)
}
/// Simple exponential smoothing.
///
/// The first output equals the first value; afterwards
/// `s_i = alpha * x_i + (1 - alpha) * s_{i-1}`. A position whose value is
/// absent repeats the previous smoothed value. When the first value is
/// absent, no values are produced before the series is rebuilt on the
/// original index.
fn exponential_smooth(&self, ts: &TimeSeries, alpha: f64) -> Result<TimeSeries> {
    let mut smoothed = Vec::with_capacity(ts.len());

    if let Some(seed) = ts.values.get_f64(0) {
        // `level` always mirrors the last element pushed to `smoothed`.
        let mut level = seed;
        smoothed.push(level);
        for position in 1..ts.len() {
            if let Some(observation) = ts.values.get_f64(position) {
                level = alpha * observation + (1.0 - alpha) * level;
            }
            smoothed.push(level);
        }
    }

    TimeSeries::new(ts.index.clone(), TimeSeriesData::from_vec(smoothed))
}
/// Placeholder for Savitzky-Golay smoothing.
///
/// No polynomial fit is performed: the call falls back to
/// `moving_average_smooth` with `_window`, and `_order` is ignored.
/// Note that `_window` is used despite its underscore prefix.
fn savitzky_golay_smooth(
&self,
ts: &TimeSeries,
_window: usize,
_order: usize,
) -> Result<TimeSeries> {
self.moving_average_smooth(ts, _window)
}
/// Placeholder for LOWESS smoothing.
///
/// `_fraction` is ignored; the call falls back to `moving_average_smooth`
/// with a fixed window of 5.
fn lowess_smooth(&self, ts: &TimeSeries, _fraction: f64) -> Result<TimeSeries> {
self.moving_average_smooth(ts, 5)
}
/// Placeholder for Kalman filtering.
///
/// No state-space model is involved; the call falls back to
/// `exponential_smooth` with a fixed `alpha` of 0.3.
fn kalman_smooth(&self, ts: &TimeSeries) -> Result<TimeSeries> {
self.exponential_smooth(ts, 0.3)
}
/// Placeholder for the Hodrick-Prescott filter.
///
/// `_lambda` is ignored; the call falls back to `moving_average_smooth`
/// with a fixed window of 10.
fn hodrick_prescott_smooth(&self, ts: &TimeSeries, _lambda: f64) -> Result<TimeSeries> {
self.moving_average_smooth(ts, 10)
}
/// Computes `(min, max, mean, population std)` over the finite values of
/// `ts`. Returns all zeros when there is no finite value.
fn calculate_basic_stats(&self, ts: &TimeSeries) -> Result<(f64, f64, f64, f64)> {
    let finite: Vec<f64> = (0..ts.len())
        .filter_map(|position| ts.values.get_f64(position))
        .filter(|value| value.is_finite())
        .collect();

    if finite.is_empty() {
        return Ok((0.0, 0.0, 0.0, 0.0));
    }

    let count = finite.len() as f64;
    let (min, max) = finite.iter().fold(
        (f64::INFINITY, f64::NEG_INFINITY),
        |(lo, hi), &value| (f64::min(lo, value), f64::max(hi, value)),
    );
    let mean = finite.iter().sum::<f64>() / count;
    let std = (finite.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / count).sqrt();

    Ok((min, max, mean, std))
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::time_series::core::{Frequency, TimeSeriesBuilder};
    use chrono::{TimeZone, Utc};

    /// Builds a 50-point daily series oscillating around 10.0 with two
    /// spikes of 100.0 at positions 10 and 30.
    fn create_test_series_with_outliers() -> TimeSeries {
        let mut builder = TimeSeriesBuilder::new();
        for day in 0..50 {
            let timestamp = Utc
                .timestamp_opt(1640995200 + (day * 86400) as i64, 0)
                .single()
                .expect("timestamp should be unambiguous");
            let is_spike = day == 10 || day == 30;
            let value = if is_spike {
                100.0
            } else {
                10.0 + (day as f64 * 0.1).sin()
            };
            builder = builder.add_point(timestamp, value);
        }
        builder
            .frequency(Frequency::Daily)
            .build()
            .expect("operation should succeed")
    }

    /// With no gaps and detection disabled, the length must be preserved.
    #[test]
    fn test_missing_value_handling() {
        let series = create_test_series_with_outliers();
        let pipeline = TimeSeriesPreprocessor::new()
            .with_missing_value_strategy(MissingValueStrategy::LinearInterpolation)
            .with_outlier_detection(OutlierDetection::None);

        let outcome = pipeline
            .preprocess(&series)
            .expect("operation should succeed");

        assert_eq!(outcome.processed_series.len(), series.len());
    }

    /// The spikes must be detected and removed.
    #[test]
    fn test_outlier_detection() {
        let series = create_test_series_with_outliers();
        let pipeline = TimeSeriesPreprocessor::new()
            .with_outlier_detection(OutlierDetection::ModifiedZScore { threshold: 2.0 });

        let outcome = pipeline
            .preprocess(&series)
            .expect("operation should succeed");

        assert!(!outcome.outlier_info.outlier_indices.is_empty());
        assert!(outcome.processed_series.len() < series.len());
    }

    /// Min-max normalization must land inside (roughly) `[0, 1]`.
    #[test]
    fn test_normalization() {
        let series = create_test_series_with_outliers();
        let pipeline = TimeSeriesPreprocessor::new()
            .with_normalization(Normalization::MinMax)
            .with_outlier_detection(OutlierDetection::None);

        let outcome = pipeline
            .preprocess(&series)
            .expect("operation should succeed");

        let normalized: Vec<f64> = (0..outcome.processed_series.len())
            .filter_map(|i| outcome.processed_series.values.get_f64(i))
            .collect();
        let lowest = normalized.iter().cloned().fold(f64::INFINITY, f64::min);
        let highest = normalized
            .iter()
            .cloned()
            .fold(f64::NEG_INFINITY, f64::max);

        assert!(lowest >= -0.01);
        assert!(highest <= 1.01);
    }

    /// Differencing must not lengthen the series and must be recorded.
    #[test]
    fn test_differencing() {
        let series = create_test_series_with_outliers();
        let config = Differencing {
            order: 1,
            seasonal_order: None,
            seasonal_period: None,
        };
        let pipeline = TimeSeriesPreprocessor::new()
            .with_differencing(config)
            .with_outlier_detection(OutlierDetection::None);

        let outcome = pipeline
            .preprocess(&series)
            .expect("operation should succeed");

        assert!(outcome.processed_series.len() <= series.len());
        assert!(outcome
            .transformations
            .iter()
            .any(|t| t.transformation_type.contains("differencing")));
    }

    /// Moving-average smoothing must keep the length and be recorded.
    #[test]
    fn test_smoothing() {
        let series = create_test_series_with_outliers();
        let config = SmoothingConfig {
            method: SmoothingMethod::MovingAverage { window: 5 },
            parameters: HashMap::new(),
        };
        let pipeline = TimeSeriesPreprocessor::new()
            .with_smoothing(config)
            .with_outlier_detection(OutlierDetection::None);

        let outcome = pipeline
            .preprocess(&series)
            .expect("operation should succeed");

        assert_eq!(outcome.processed_series.len(), series.len());
        assert!(outcome
            .transformations
            .iter()
            .any(|t| t.transformation_type.contains("smoothing")));
    }

    /// Several stages combined: more than one record, outliers found, and
    /// the series does not grow.
    #[test]
    fn test_comprehensive_preprocessing() {
        let series = create_test_series_with_outliers();
        let smoothing = SmoothingConfig {
            method: SmoothingMethod::MovingAverage { window: 3 },
            parameters: HashMap::new(),
        };
        let pipeline = TimeSeriesPreprocessor::new()
            .with_missing_value_strategy(MissingValueStrategy::LinearInterpolation)
            .with_outlier_detection(OutlierDetection::ModifiedZScore { threshold: 2.0 })
            .with_smoothing(smoothing)
            .with_normalization(Normalization::ZScore);

        let outcome = pipeline
            .preprocess(&series)
            .expect("operation should succeed");

        assert!(outcome.transformations.len() > 1);
        assert!(outcome.statistics.outliers_detected > 0);
        assert!(outcome.statistics.final_length <= outcome.statistics.original_length);
    }

    /// Z-score output must have zero mean and unit population deviation.
    #[test]
    fn test_zscore_normalization() {
        let input = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let pipeline = TimeSeriesPreprocessor::new();

        let standardized = pipeline
            .zscore_normalize(&input)
            .expect("operation should succeed");

        let count = standardized.len() as f64;
        let mean = standardized.iter().sum::<f64>() / count;
        let variance = standardized
            .iter()
            .map(|x| (x - mean).powi(2))
            .sum::<f64>()
            / count;
        let std = variance.sqrt();

        assert!((mean.abs()) < 1e-10);
        assert!((std - 1.0).abs() < 1e-10);
    }

    /// Each statistical detector must flag the obvious 100.0 spike.
    #[test]
    fn test_outlier_detection_methods() {
        let samples = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];

        let mut builder = TimeSeriesBuilder::new();
        for (day, &sample) in samples.iter().enumerate() {
            let timestamp = Utc
                .timestamp_opt(1640995200 + (day * 86400) as i64, 0)
                .single()
                .expect("timestamp should be unambiguous");
            builder = builder.add_point(timestamp, sample);
        }
        let series = builder.build().expect("operation should succeed");
        let pipeline = TimeSeriesPreprocessor::new();

        let by_zscore = pipeline
            .detect_outliers_zscore(&series, 2.0)
            .expect("operation should succeed");
        assert!(!by_zscore.is_empty());

        let by_modified_zscore = pipeline
            .detect_outliers_modified_zscore(&series, 2.0)
            .expect("operation should succeed");
        assert!(!by_modified_zscore.is_empty());

        let by_iqr = pipeline
            .detect_outliers_iqr(&series, 1.5)
            .expect("operation should succeed");
        assert!(!by_iqr.is_empty());
    }
}