Skip to main content

so_tsa/
timeseries.rs

1//! Time series data structure with datetime indexing
2
3use ndarray::Array1;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
7use so_core::data::{DataFrame, Series};
8use so_core::error::{Error, Result};
9
10/// Frequency of time series data
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
12pub enum Frequency {
13    /// Annual data
14    Yearly,
15    /// Quarterly data
16    Quarterly,
17    /// Monthly data
18    Monthly,
19    /// Weekly data
20    Weekly,
21    /// Daily data
22    Daily,
23    /// Hourly data
24    Hourly,
25    /// Minutely data
26    Minutely,
27    /// Secondly data
28    Secondly,
29    /// Custom frequency (custom steps per year)
30    Custom(u32),
31    /// Irregular/unspecified frequency
32    Irregular,
33}
34
35impl Frequency {
36    /// Get number of periods per year
37    pub fn periods_per_year(&self) -> Option<f64> {
38        match self {
39            Frequency::Yearly => Some(1.0),
40            Frequency::Quarterly => Some(4.0),
41            Frequency::Monthly => Some(12.0),
42            Frequency::Weekly => Some(52.1775), // Average weeks per year
43            Frequency::Daily => Some(365.25),   // Account for leap years
44            Frequency::Hourly => Some(365.25 * 24.0),
45            Frequency::Minutely => Some(365.25 * 24.0 * 60.0),
46            Frequency::Secondly => Some(365.25 * 24.0 * 60.0 * 60.0),
47            Frequency::Custom(n) => Some(*n as f64),
48            Frequency::Irregular => None,
49        }
50    }
51
52    /// Get frequency name
53    pub fn name(&self) -> &'static str {
54        match self {
55            Frequency::Yearly => "Yearly",
56            Frequency::Quarterly => "Quarterly",
57            Frequency::Monthly => "Monthly",
58            Frequency::Weekly => "Weekly",
59            Frequency::Daily => "Daily",
60            Frequency::Hourly => "Hourly",
61            Frequency::Minutely => "Minutely",
62            Frequency::Secondly => "Secondly",
63            Frequency::Custom(_n) => "Custom",
64            Frequency::Irregular => "Irregular",
65        }
66    }
67}
68
69/// Time series data structure
70#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct TimeSeries {
72    /// Series name
73    pub name: String,
74    /// Time index (Unix timestamps in seconds)
75    pub timestamps: Vec<i64>,
76    /// Values
77    pub values: Array1<f64>,
78    /// Frequency (if regular)
79    pub frequency: Option<Frequency>,
80    /// Metadata
81    pub metadata: HashMap<String, String>,
82}
83
84impl TimeSeries {
85    /// Create a new time series from arrays
86    pub fn new(
87        name: impl Into<String>,
88        timestamps: Vec<i64>,
89        values: Array1<f64>,
90        frequency: Option<Frequency>,
91    ) -> Result<Self> {
92        if timestamps.len() != values.len() {
93            return Err(Error::DimensionMismatch(format!(
94                "Timestamps length {} != values length {}",
95                timestamps.len(),
96                values.len()
97            )));
98        }
99
100        if values.len() < 2 {
101            return Err(Error::DataError(
102                "Time series must have at least 2 observations".to_string(),
103            ));
104        }
105
106        // Check if timestamps are sorted
107        for i in 1..timestamps.len() {
108            if timestamps[i] <= timestamps[i - 1] {
109                return Err(Error::DataError(
110                    "Timestamps must be strictly increasing".to_string(),
111                ));
112            }
113        }
114
115        Ok(Self {
116            name: name.into(),
117            timestamps,
118            values,
119            frequency,
120            metadata: HashMap::new(),
121        })
122    }
123
124    /// Create from DataFrame with date and value columns
125    pub fn from_dataframe(df: &DataFrame, value_col: &str, date_col: &str) -> Result<Self> {
126        let value_series = df
127            .column(value_col)
128            .ok_or_else(|| Error::DataError(format!("Value column '{}' not found", value_col)))?;
129
130        let date_series = df
131            .column(date_col)
132            .ok_or_else(|| Error::DataError(format!("Date column '{}' not found", date_col)))?;
133
134        // Convert date column to timestamps
135        // For now, assume dates are already in timestamp format or can be parsed
136        let timestamps: Vec<i64> = date_series.data().iter().map(|&x| x as i64).collect();
137
138        Self::new(
139            value_col,
140            timestamps,
141            value_series.data().to_owned(),
142            None, // Auto-detect frequency
143        )
144    }
145
146    /// Create a regular time series with integer index
147    pub fn regular(name: impl Into<String>, values: Array1<f64>, frequency: Frequency) -> Self {
148        let n = values.len();
149        let timestamps: Vec<i64> = (0..n).map(|i| i as i64).collect();
150
151        Self {
152            name: name.into(),
153            timestamps,
154            values,
155            frequency: Some(frequency),
156            metadata: HashMap::new(),
157        }
158    }
159
160    /// Get series length
161    pub fn len(&self) -> usize {
162        self.values.len()
163    }
164
165    /// Check if series is empty
166    pub fn is_empty(&self) -> bool {
167        self.values.is_empty()
168    }
169
170    /// Get series name
171    pub fn name(&self) -> &str {
172        &self.name
173    }
174
175    /// Get values
176    pub fn values(&self) -> &Array1<f64> {
177        &self.values
178    }
179
180    /// Get timestamps
181    pub fn timestamps(&self) -> &[i64] {
182        &self.timestamps
183    }
184
185    /// Get frequency
186    pub fn frequency(&self) -> Option<Frequency> {
187        self.frequency
188    }
189
190    /// Set metadata
191    pub fn set_metadata(&mut self, key: impl Into<String>, value: impl Into<String>) {
192        self.metadata.insert(key.into(), value.into());
193    }
194
195    /// Get metadata
196    pub fn get_metadata(&self, key: &str) -> Option<&String> {
197        self.metadata.get(key)
198    }
199
200    /// Get start time
201    pub fn start_time(&self) -> Option<i64> {
202        self.timestamps.first().copied()
203    }
204
205    /// Get end time
206    pub fn end_time(&self) -> Option<i64> {
207        self.timestamps.last().copied()
208    }
209
210    /// Compute basic statistics
211    pub fn stats(&self) -> TimeSeriesStats {
212        let n = self.len() as f64;
213
214        let mean = self.values.mean().unwrap_or(0.0);
215        let variance = self.values.var(1.0);
216        let std = variance.sqrt();
217
218        // Autocorrelation at lag 1
219        let acf1 = if n > 1.0 {
220            let mut sum = 0.0;
221            for i in 1..self.len() {
222                sum += (self.values[i] - mean) * (self.values[i - 1] - mean);
223            }
224            sum / ((n - 1.0) * variance)
225        } else {
226            0.0
227        };
228
229        TimeSeriesStats {
230            n_obs: self.len(),
231            mean,
232            std,
233            variance,
234            min: self.values.iter().fold(f64::INFINITY, |a, &b| a.min(b)),
235            max: self.values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b)),
236            acf1,
237        }
238    }
239
240    /// Differencing: y_t - y_{t-d}
241    pub fn diff(&self, lag: usize, order: usize) -> Result<TimeSeries> {
242        if lag < 1 {
243            return Err(Error::DataError("Lag must be >= 1".to_string()));
244        }
245
246        if order < 1 {
247            return Ok(self.clone());
248        }
249
250        let mut current = self.values.clone();
251        let mut timestamps = self.timestamps[lag..].to_vec();
252
253        for _ in 0..order {
254            let n = current.len();
255            if n <= lag {
256                return Err(Error::DataError(
257                    "Not enough observations for differencing".to_string(),
258                ));
259            }
260
261            let diffed: Array1<f64> = (lag..n).map(|i| current[i] - current[i - lag]).collect();
262
263            current = diffed;
264            timestamps = timestamps[lag..].to_vec();
265        }
266
267        TimeSeries::new(
268            format!("{}_diff{}", self.name, order),
269            timestamps,
270            current,
271            self.frequency,
272        )
273    }
274
275    /// Log transformation (with offset to handle zeros/negatives)
276    pub fn log(&self, offset: f64) -> TimeSeries {
277        let values = self.values.mapv(|x| (x + offset).ln());
278
279        TimeSeries {
280            name: format!("log({})", self.name),
281            timestamps: self.timestamps.clone(),
282            values,
283            frequency: self.frequency,
284            metadata: self.metadata.clone(),
285        }
286    }
287
288    /// Box-Cox transformation
289    pub fn boxcox(&self, lambda: f64) -> TimeSeries {
290        let values = if lambda == 0.0 {
291            self.values.mapv(|x| x.ln())
292        } else {
293            self.values.mapv(|x| (x.powf(lambda) - 1.0) / lambda)
294        };
295
296        TimeSeries {
297            name: format!("boxcox({}, λ={})", self.name, lambda),
298            timestamps: self.timestamps.clone(),
299            values,
300            frequency: self.frequency,
301            metadata: self.metadata.clone(),
302        }
303    }
304
305    /// Slice time series
306    pub fn slice(&self, start: Option<i64>, end: Option<i64>) -> Result<TimeSeries> {
307        let start_idx = match start {
308            Some(t) => self.timestamps.iter().position(|&ts| ts >= t).unwrap_or(0),
309            None => 0,
310        };
311
312        let end_idx = match end {
313            Some(t) => self
314                .timestamps
315                .iter()
316                .rposition(|&ts| ts <= t)
317                .map(|pos| pos + 1)
318                .unwrap_or(self.len()),
319            None => self.len(),
320        };
321
322        if start_idx >= end_idx {
323            return Err(Error::DataError("Invalid slice: start >= end".to_string()));
324        }
325
326        TimeSeries::new(
327            self.name.clone(),
328            self.timestamps[start_idx..end_idx].to_vec(),
329            self.values
330                .slice(ndarray::s![start_idx..end_idx])
331                .to_owned(),
332            self.frequency,
333        )
334    }
335
336    /// Fill missing values (NaN) using specified method
337    pub fn fillna(&self, method: FillMethod) -> TimeSeries {
338        let values = self.values.to_vec();
339        let filled = match method {
340            FillMethod::Mean => {
341                let mean = self.values.mean().unwrap_or(0.0);
342                values
343                    .iter()
344                    .map(|&x| if x.is_nan() { mean } else { x })
345                    .collect()
346            }
347            FillMethod::Median => {
348                let mut sorted: Vec<f64> =
349                    values.iter().filter(|&&x| !x.is_nan()).copied().collect();
350                sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
351                let median = if sorted.is_empty() {
352                    0.0
353                } else if sorted.len() % 2 == 0 {
354                    (sorted[sorted.len() / 2 - 1] + sorted[sorted.len() / 2]) / 2.0
355                } else {
356                    sorted[sorted.len() / 2]
357                };
358                values
359                    .iter()
360                    .map(|&x| if x.is_nan() { median } else { x })
361                    .collect()
362            }
363            FillMethod::ForwardFill => {
364                let mut last_valid = 0.0;
365                values
366                    .iter()
367                    .map(|&x| {
368                        if !x.is_nan() {
369                            last_valid = x;
370                            x
371                        } else {
372                            last_valid
373                        }
374                    })
375                    .collect()
376            }
377            FillMethod::BackwardFill => {
378                let mut filled = values.clone();
379                let mut last_valid = 0.0;
380                for i in (0..filled.len()).rev() {
381                    if !filled[i].is_nan() {
382                        last_valid = filled[i];
383                    } else {
384                        filled[i] = last_valid;
385                    }
386                }
387                filled
388            }
389            FillMethod::Linear => {
390                let mut filled = values.clone();
391                let mut i = 0;
392                while i < filled.len() {
393                    if filled[i].is_nan() {
394                        let start = i;
395                        while i < filled.len() && filled[i].is_nan() {
396                            i += 1;
397                        }
398                        let end = i;
399
400                        if start > 0 && end < filled.len() {
401                            let prev_val = filled[start - 1];
402                            let next_val = filled[end];
403                            let step = (next_val - prev_val) / (end - start + 1) as f64;
404
405                            for j in start..end {
406                                filled[j] = prev_val + step * (j - start + 1) as f64;
407                            }
408                        }
409                    }
410                    i += 1;
411                }
412                filled
413            }
414        };
415
416        TimeSeries {
417            name: format!("{}_filled", self.name),
418            timestamps: self.timestamps.clone(),
419            values: Array1::from_vec(filled),
420            frequency: self.frequency,
421            metadata: self.metadata.clone(),
422        }
423    }
424
425    /// Detect outliers using IQR method
426    pub fn detect_outliers(&self, threshold: f64) -> Vec<usize> {
427        // Manual quantile calculation to avoid ndarray_stats dependency issues
428        let mut sorted: Vec<f64> = self.values.to_vec();
429        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
430
431        let n = sorted.len();
432        let q1_idx = ((n as f64) * 0.25).floor() as usize;
433        let q3_idx = ((n as f64) * 0.75).floor() as usize;
434
435        let q1 = sorted.get(q1_idx).copied().unwrap_or(0.0);
436        let q3 = sorted.get(q3_idx).copied().unwrap_or(0.0);
437        let iqr = q3 - q1;
438
439        let lower_bound = q1 - threshold * iqr;
440        let upper_bound = q3 + threshold * iqr;
441
442        self.values
443            .iter()
444            .enumerate()
445            .filter(|&(_, &x)| x < lower_bound || x > upper_bound)
446            .map(|(i, _)| i)
447            .collect()
448    }
449
450    /// Convert to DataFrame
451    pub fn to_dataframe(&self) -> DataFrame {
452        let mut columns = HashMap::new();
453
454        // Add timestamps
455        let timestamps_series = Series::new(
456            "timestamp",
457            Array1::from_vec(self.timestamps.iter().map(|&t| t as f64).collect()),
458        );
459        columns.insert("timestamp".to_string(), timestamps_series);
460
461        // Add values
462        let values_series = Series::new(&self.name, self.values.clone());
463        columns.insert(self.name.clone(), values_series);
464
465        DataFrame::from_series(columns).unwrap_or_default()
466    }
467}
468
/// Descriptive statistics of a time series, as produced by `TimeSeries::stats`.
#[derive(Debug, Clone)]
pub struct TimeSeriesStats {
    /// Number of observations.
    pub n_obs: usize,
    /// Arithmetic mean of the values.
    pub mean: f64,
    /// Standard deviation (square root of `variance`).
    pub std: f64,
    /// Sample variance of the values.
    pub variance: f64,
    /// Smallest value.
    pub min: f64,
    /// Largest value.
    pub max: f64,
    /// Autocorrelation at lag 1.
    pub acf1: f64,
}
480
/// Strategy used by `TimeSeries::fillna` to replace missing (NaN) values.
///
/// Derives `Eq` alongside `PartialEq`, matching the `Frequency` enum: the
/// variants carry no data, so equality is total.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FillMethod {
    /// Replace each NaN with the mean of the series.
    Mean,
    /// Replace each NaN with the median of the non-NaN values.
    Median,
    /// Carry the most recent non-NaN value forward.
    ForwardFill,
    /// Carry the next non-NaN value backward.
    BackwardFill,
    /// Linearly interpolate interior gaps between their neighbouring values.
    Linear,
}
490
491impl TimeSeriesStats {
492    /// Create summary string
493    pub fn summary(&self) -> String {
494        format!(
495            "Observations: {}\nMean: {:.4}\nStd: {:.4}\nMin: {:.4}\nMax: {:.4}\nACF(1): {:.4}",
496            self.n_obs, self.mean, self.std, self.min, self.max, self.acf1
497        )
498    }
499}