1use crate::error::{MLError, Result};
4use scirs2_core::ndarray::{Array1, Array2};
5use serde::{Deserialize, Serialize};
6use std::f64::consts::PI;
7
8pub fn generate_synthetic_time_series(
10 length: usize,
11 seasonality: Option<usize>,
12 trend: f64,
13 noise: f64,
14) -> Array2<f64> {
15 let mut data = Array2::zeros((length, 1));
16
17 for i in 0..length {
18 let t = i as f64;
19
20 let trend_val = trend * t;
22
23 let seasonal_val = if let Some(period) = seasonality {
25 10.0 * (2.0 * PI * t / period as f64).sin()
26 } else {
27 0.0
28 };
29
30 let noise_val = noise * (fastrand::f64() - 0.5);
32
33 data[[i, 0]] = 50.0 + trend_val + seasonal_val + noise_val;
34 }
35
36 data
37}
38
39pub fn generate_complex_time_series(
41 length: usize,
42 config: SyntheticDataConfig,
43) -> Result<Array2<f64>> {
44 let mut data = Array2::zeros((length, config.num_features));
45
46 for feature_idx in 0..config.num_features {
47 for i in 0..length {
48 let t = i as f64;
49 let mut value = config.base_level;
50
51 for trend in &config.trends {
53 value += trend.apply(t);
54 }
55
56 for seasonal in &config.seasonalities {
58 value += seasonal.apply(t);
59 }
60
61 value += config.noise_level * (fastrand::f64() - 0.5);
63
64 let feature_factor = 1.0 + 0.1 * feature_idx as f64;
66 value *= feature_factor;
67
68 data[[i, feature_idx]] = value;
69 }
70 }
71
72 if config.feature_correlations {
74 data = add_feature_correlations(data, 0.3)?;
75 }
76
77 if config.outlier_probability > 0.0 {
79 data = add_outliers(data, config.outlier_probability, config.outlier_magnitude)?;
80 }
81
82 Ok(data)
83}
84
/// Configuration describing how [`generate_complex_time_series`] composes a
/// synthetic multivariate series: base level + trends + seasonalities +
/// noise, with optional feature correlation and outlier injection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SyntheticDataConfig {
    /// Number of feature columns to generate.
    pub num_features: usize,

    /// Constant offset every sample starts from.
    pub base_level: f64,

    /// Trend components; all are summed into every sample.
    pub trends: Vec<TrendComponent>,

    /// Seasonal components; all are summed into every sample.
    pub seasonalities: Vec<SeasonalComponent>,

    /// Scale of the centered uniform noise added to every sample.
    pub noise_level: f64,

    /// When true, adjacent feature columns are blended to induce correlation.
    pub feature_correlations: bool,

    /// Per-cell probability of injecting an outlier (0.0 disables).
    pub outlier_probability: f64,

    /// Absolute shift applied to a cell chosen as an outlier.
    pub outlier_magnitude: f64,
}
112
/// One additive trend term of a synthetic series; evaluated by
/// [`TrendComponent::apply`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrendComponent {
    /// Functional form of the trend.
    pub trend_type: TrendType,

    /// Scale factor applied to the trend function.
    pub magnitude: f64,

    /// Extra shape parameters; index 0 is the rate for `Exponential`
    /// and the frequency for `Sinusoidal` (see `apply`).
    pub parameters: Vec<f64>,
}
125
/// Functional form of a [`TrendComponent`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TrendType {
    /// `magnitude * t`
    Linear,
    /// `magnitude * t^2`
    Quadratic,
    /// `magnitude * exp(rate * t)`
    Exponential,
    /// `magnitude * ln(t)` for `t > 0`, else 0
    Logarithmic,
    /// `magnitude * sin(2*pi*frequency*t)`
    Sinusoidal,
    /// Named custom trend; currently evaluated as linear (see `apply`).
    Custom(String),
}
136
/// One additive seasonal term of a synthetic series; evaluated by
/// [`SeasonalComponent::apply`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeasonalComponent {
    /// Cycle length in samples.
    pub period: usize,

    /// Peak magnitude of the waveform.
    pub amplitude: f64,

    /// Phase offset in radians of a full cycle.
    pub phase: f64,

    /// Waveform sampled over each cycle.
    pub pattern_type: SeasonalPattern,
}
152
/// Waveform shape of a [`SeasonalComponent`], sampled over one cycle.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SeasonalPattern {
    /// Smooth sine wave.
    Sinusoidal,
    /// Symmetric triangle wave in [-1, 1].
    Triangular,
    /// +1 for the first half cycle, -1 for the second.
    Square,
    /// Linear ramp from -1 to 1 over the cycle.
    Sawtooth,
    /// User-supplied samples, indexed by position within the cycle.
    Custom(Vec<f64>),
}
162
163impl TrendComponent {
164 pub fn apply(&self, t: f64) -> f64 {
166 match &self.trend_type {
167 TrendType::Linear => self.magnitude * t,
168 TrendType::Quadratic => self.magnitude * t * t,
169 TrendType::Exponential => {
170 let rate = self.parameters.get(0).copied().unwrap_or(0.01);
171 self.magnitude * (rate * t).exp()
172 }
173 TrendType::Logarithmic => {
174 if t > 0.0 {
175 self.magnitude * t.ln()
176 } else {
177 0.0
178 }
179 }
180 TrendType::Sinusoidal => {
181 let frequency = self.parameters.get(0).copied().unwrap_or(0.1);
182 self.magnitude * (2.0 * PI * frequency * t).sin()
183 }
184 TrendType::Custom(_) => {
185 self.magnitude * t
187 }
188 }
189 }
190}
191
192impl SeasonalComponent {
193 pub fn apply(&self, t: f64) -> f64 {
195 let normalized_time = (t % self.period as f64) / self.period as f64;
196 let phase_adjusted_time = (normalized_time + self.phase / (2.0 * PI)) % 1.0;
197
198 match &self.pattern_type {
199 SeasonalPattern::Sinusoidal => self.amplitude * (2.0 * PI * phase_adjusted_time).sin(),
200 SeasonalPattern::Triangular => {
201 let triangle_val = if phase_adjusted_time < 0.5 {
202 4.0 * phase_adjusted_time - 1.0
203 } else {
204 3.0 - 4.0 * phase_adjusted_time
205 };
206 self.amplitude * triangle_val
207 }
208 SeasonalPattern::Square => {
209 let square_val = if phase_adjusted_time < 0.5 { 1.0 } else { -1.0 };
210 self.amplitude * square_val
211 }
212 SeasonalPattern::Sawtooth => self.amplitude * (2.0 * phase_adjusted_time - 1.0),
213 SeasonalPattern::Custom(pattern) => {
214 if pattern.is_empty() {
215 0.0
216 } else {
217 let index = (phase_adjusted_time * pattern.len() as f64) as usize;
218 let index = index.min(pattern.len() - 1);
219 self.amplitude * pattern[index]
220 }
221 }
222 }
223 }
224}
225
226fn add_feature_correlations(
228 mut data: Array2<f64>,
229 correlation_strength: f64,
230) -> Result<Array2<f64>> {
231 let (n_samples, n_features) = data.dim();
232
233 if n_features < 2 {
234 return Ok(data);
235 }
236
237 for i in 1..n_features {
239 for j in 0..n_samples {
240 let original_value = data[[j, i]];
241 let correlated_value = data[[j, i - 1]];
242
243 data[[j, i]] = original_value * (1.0 - correlation_strength)
244 + correlated_value * correlation_strength;
245 }
246 }
247
248 Ok(data)
249}
250
251fn add_outliers(mut data: Array2<f64>, probability: f64, magnitude: f64) -> Result<Array2<f64>> {
253 let (n_samples, n_features) = data.dim();
254
255 for i in 0..n_samples {
256 for j in 0..n_features {
257 if fastrand::f64() < probability {
258 let outlier_direction = if fastrand::bool() { 1.0 } else { -1.0 };
259 data[[i, j]] += outlier_direction * magnitude;
260 }
261 }
262 }
263
264 Ok(data)
265}
266
/// Stateless namespace for 2-D data preprocessing routines (scaling,
/// outlier removal, missing-value interpolation).
pub struct DataPreprocessor;
269
270impl DataPreprocessor {
271 pub fn standardize(data: &Array2<f64>) -> Result<(Array2<f64>, Array1<f64>, Array1<f64>)> {
273 let (n_samples, n_features) = data.dim();
274 let mut normalized = data.clone();
275 let mut means = Array1::zeros(n_features);
276 let mut stds = Array1::zeros(n_features);
277
278 for j in 0..n_features {
279 let column = data.column(j);
280 let mean = column.mean().unwrap_or(0.0);
281 let std = column.std(1.0).max(1e-8); means[j] = mean;
284 stds[j] = std;
285
286 for i in 0..n_samples {
287 normalized[[i, j]] = (data[[i, j]] - mean) / std;
288 }
289 }
290
291 Ok((normalized, means, stds))
292 }
293
294 pub fn min_max_scale(data: &Array2<f64>) -> Result<(Array2<f64>, Array1<f64>, Array1<f64>)> {
296 let (n_samples, n_features) = data.dim();
297 let mut scaled = data.clone();
298 let mut mins = Array1::zeros(n_features);
299 let mut ranges = Array1::zeros(n_features);
300
301 for j in 0..n_features {
302 let column = data.column(j);
303 let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
304 let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
305 let range = (max_val - min_val).max(1e-8); mins[j] = min_val;
308 ranges[j] = range;
309
310 for i in 0..n_samples {
311 scaled[[i, j]] = (data[[i, j]] - min_val) / range;
312 }
313 }
314
315 Ok((scaled, mins, ranges))
316 }
317
318 pub fn remove_outliers(data: &Array2<f64>, iqr_multiplier: f64) -> Result<Array2<f64>> {
320 let (n_samples, n_features) = data.dim();
321 let mut cleaned_data = Vec::new();
322
323 for i in 0..n_samples {
324 let mut is_outlier = false;
325
326 for j in 0..n_features {
327 let column = data.column(j);
328 let mut sorted_values: Vec<f64> = column.iter().cloned().collect();
329 sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
330
331 let q1_idx = sorted_values.len() / 4;
332 let q3_idx = 3 * sorted_values.len() / 4;
333 let q1 = sorted_values[q1_idx];
334 let q3 = sorted_values[q3_idx];
335 let iqr = q3 - q1;
336
337 let lower_bound = q1 - iqr_multiplier * iqr;
338 let upper_bound = q3 + iqr_multiplier * iqr;
339
340 if data[[i, j]] < lower_bound || data[[i, j]] > upper_bound {
341 is_outlier = true;
342 break;
343 }
344 }
345
346 if !is_outlier {
347 cleaned_data.push(data.row(i).to_owned());
348 }
349 }
350
351 if cleaned_data.is_empty() {
352 return Err(MLError::DataError(
353 "All data points were classified as outliers".to_string(),
354 ));
355 }
356
357 let cleaned_samples = cleaned_data.len();
358 let mut cleaned_array = Array2::zeros((cleaned_samples, n_features));
359
360 for (i, row) in cleaned_data.iter().enumerate() {
361 cleaned_array.row_mut(i).assign(row);
362 }
363
364 Ok(cleaned_array)
365 }
366
367 pub fn interpolate_missing(data: &Array2<f64>) -> Result<Array2<f64>> {
369 let mut interpolated = data.clone();
370 let (n_samples, n_features) = data.dim();
371
372 for j in 0..n_features {
373 for i in 0..n_samples {
374 if !data[[i, j]].is_finite() {
375 let mut left_val = None;
377 let mut right_val = None;
378
379 for k in (0..i).rev() {
381 if data[[k, j]].is_finite() {
382 left_val = Some((k, data[[k, j]]));
383 break;
384 }
385 }
386
387 for k in (i + 1)..n_samples {
389 if data[[k, j]].is_finite() {
390 right_val = Some((k, data[[k, j]]));
391 break;
392 }
393 }
394
395 let interpolated_value = match (left_val, right_val) {
397 (Some((left_idx, left_val)), Some((right_idx, right_val))) => {
398 let weight = (i - left_idx) as f64 / (right_idx - left_idx) as f64;
399 left_val + weight * (right_val - left_val)
400 }
401 (Some((_, left_val)), None) => left_val,
402 (None, Some((_, right_val))) => right_val,
403 (None, None) => 0.0, };
405
406 interpolated[[i, j]] = interpolated_value;
407 }
408 }
409 }
410
411 Ok(interpolated)
412 }
413}
414
/// Stateless namespace for basic 1-D time-series diagnostics
/// (stationarity heuristic, autocorrelation, seasonality, trend fit).
pub struct TimeSeriesAnalyzer;
417
418impl TimeSeriesAnalyzer {
419 pub fn is_stationary(data: &Array1<f64>) -> bool {
421 if data.len() < 10 {
423 return false;
424 }
425
426 let mut differences = Array1::zeros(data.len() - 1);
427 for i in 1..data.len() {
428 differences[i - 1] = data[i] - data[i - 1];
429 }
430
431 let original_var = data.var(1.0);
432 let diff_var = differences.var(1.0);
433
434 diff_var < 0.8 * original_var
436 }
437
438 pub fn autocorrelation(data: &Array1<f64>, max_lag: usize) -> Array1<f64> {
440 let n = data.len();
441 let max_lag = max_lag.min(n / 2);
442 let mut autocorr = Array1::zeros(max_lag + 1);
443
444 let mean = data.mean().unwrap_or(0.0);
445 let variance = data.var(1.0);
446
447 for lag in 0..=max_lag {
448 let mut sum = 0.0;
449 let mut count = 0;
450
451 for t in lag..n {
452 sum += (data[t] - mean) * (data[t - lag] - mean);
453 count += 1;
454 }
455
456 if count > 0 && variance > 1e-10 {
457 autocorr[lag] = sum / (count as f64 * variance);
458 }
459 }
460
461 autocorr
462 }
463
464 pub fn detect_seasonality(data: &Array1<f64>, max_period: usize) -> Vec<usize> {
466 let autocorr = Self::autocorrelation(data, max_period);
467 let mut seasonal_periods = Vec::new();
468
469 for lag in 2..autocorr.len() {
471 if lag < autocorr.len() - 1 {
472 let current = autocorr[lag];
473 let prev = autocorr[lag - 1];
474 let next = autocorr[lag + 1];
475
476 if current > prev && current > next && current > 0.3 {
478 seasonal_periods.push(lag);
479 }
480 }
481 }
482
483 seasonal_periods
484 }
485
486 pub fn estimate_trend(data: &Array1<f64>) -> (f64, f64) {
488 let n = data.len() as f64;
489
490 let mut sum_x = 0.0;
491 let mut sum_y = 0.0;
492 let mut sum_xy = 0.0;
493 let mut sum_x2 = 0.0;
494
495 for (i, &y) in data.iter().enumerate() {
496 let x = i as f64;
497 sum_x += x;
498 sum_y += y;
499 sum_xy += x * y;
500 sum_x2 += x * x;
501 }
502
503 let denominator = n * sum_x2 - sum_x * sum_x;
504
505 if denominator.abs() > 1e-10 {
506 let slope = (n * sum_xy - sum_x * sum_y) / denominator;
507 let intercept = (sum_y - slope * sum_x) / n;
508 (slope, intercept)
509 } else {
510 (0.0, sum_y / n)
511 }
512 }
513}
514
515impl Default for SyntheticDataConfig {
517 fn default() -> Self {
518 Self {
519 num_features: 1,
520 base_level: 100.0,
521 trends: vec![TrendComponent {
522 trend_type: TrendType::Linear,
523 magnitude: 0.1,
524 parameters: Vec::new(),
525 }],
526 seasonalities: vec![SeasonalComponent {
527 period: 12,
528 amplitude: 10.0,
529 phase: 0.0,
530 pattern_type: SeasonalPattern::Sinusoidal,
531 }],
532 noise_level: 5.0,
533 feature_correlations: false,
534 outlier_probability: 0.02,
535 outlier_magnitude: 20.0,
536 }
537 }
538}
539
540impl SyntheticDataConfig {
541 pub fn financial() -> Self {
543 Self {
544 num_features: 5, base_level: 100.0,
546 trends: vec![
547 TrendComponent {
548 trend_type: TrendType::Linear,
549 magnitude: 0.05,
550 parameters: Vec::new(),
551 },
552 TrendComponent {
553 trend_type: TrendType::Sinusoidal,
554 magnitude: 2.0,
555 parameters: vec![0.02], },
557 ],
558 seasonalities: vec![
559 SeasonalComponent {
560 period: 5, amplitude: 3.0,
562 phase: 0.0,
563 pattern_type: SeasonalPattern::Sinusoidal,
564 },
565 SeasonalComponent {
566 period: 21, amplitude: 1.5,
568 phase: PI / 4.0,
569 pattern_type: SeasonalPattern::Sinusoidal,
570 },
571 ],
572 noise_level: 2.0,
573 feature_correlations: true,
574 outlier_probability: 0.05,
575 outlier_magnitude: 10.0,
576 }
577 }
578
579 pub fn iot_sensor() -> Self {
581 Self {
582 num_features: 3, base_level: 25.0,
584 trends: vec![TrendComponent {
585 trend_type: TrendType::Sinusoidal,
586 magnitude: 5.0,
587 parameters: vec![1.0 / 24.0], }],
589 seasonalities: vec![
590 SeasonalComponent {
591 period: 24, amplitude: 8.0,
593 phase: 0.0,
594 pattern_type: SeasonalPattern::Sinusoidal,
595 },
596 SeasonalComponent {
597 period: 168, amplitude: 3.0,
599 phase: PI / 3.0,
600 pattern_type: SeasonalPattern::Sinusoidal,
601 },
602 ],
603 noise_level: 1.0,
604 feature_correlations: true,
605 outlier_probability: 0.01,
606 outlier_magnitude: 15.0,
607 }
608 }
609
610 pub fn demand() -> Self {
612 Self {
613 num_features: 1,
614 base_level: 1000.0,
615 trends: vec![TrendComponent {
616 trend_type: TrendType::Linear,
617 magnitude: 2.0,
618 parameters: Vec::new(),
619 }],
620 seasonalities: vec![
621 SeasonalComponent {
622 period: 7, amplitude: 200.0,
624 phase: 0.0,
625 pattern_type: SeasonalPattern::Sinusoidal,
626 },
627 SeasonalComponent {
628 period: 365, amplitude: 500.0,
630 phase: PI / 2.0,
631 pattern_type: SeasonalPattern::Sinusoidal,
632 },
633 ],
634 noise_level: 50.0,
635 feature_correlations: false,
636 outlier_probability: 0.03,
637 outlier_magnitude: 300.0,
638 }
639 }
640}