// shape_runtime/data/dataframe.rs

1//! Generic DataFrame for columnar time series data
2//!
3//! Industry-agnostic storage for any time series data with named columns.
4
5use super::Timeframe;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9/// Generic columnar storage for time series data
10///
11/// Stores data as named columns of f64 values, plus timestamps.
12/// No knowledge of specific column names (open, high, low, close, etc.)
13/// is encoded here - that's determined by the data source.
14#[derive(Debug, Clone)]
15pub struct DataFrame {
16    /// Column name -> column data
17    pub columns: HashMap<String, Vec<f64>>,
18    /// Timestamps (always present, Unix seconds)
19    pub timestamps: Vec<i64>,
20    /// Generic identifier
21    pub id: String,
22    /// Timeframe of the data
23    pub timeframe: Timeframe,
24}
25
26impl DataFrame {
27    /// Create a new empty DataFrame
28    pub fn new(id: &str, timeframe: Timeframe) -> Self {
29        Self {
30            columns: HashMap::new(),
31            timestamps: Vec::new(),
32            id: id.to_string(),
33            timeframe,
34        }
35    }
36
37    /// Create a DataFrame with pre-allocated capacity
38    pub fn with_capacity(id: &str, timeframe: Timeframe, capacity: usize) -> Self {
39        Self {
40            columns: HashMap::new(),
41            timestamps: Vec::with_capacity(capacity),
42            id: id.to_string(),
43            timeframe,
44        }
45    }
46
47    /// Create from a list of rows
48    pub fn from_rows(id: &str, timeframe: Timeframe, rows: Vec<OwnedDataRow>) -> Self {
49        if rows.is_empty() {
50            return Self::new(id, timeframe);
51        }
52
53        let len = rows.len();
54        let mut columns: HashMap<String, Vec<f64>> = HashMap::new();
55        let mut timestamps = Vec::with_capacity(len);
56
57        // Infer schema from all rows (to handle sparse data if any) or just first row?
58        // Generic approach: iterate all rows
59        for row in &rows {
60            timestamps.push(row.timestamp);
61            for (key, value) in &row.fields {
62                columns
63                    .entry(key.clone())
64                    .or_insert_with(|| Vec::with_capacity(len))
65                    .push(*value);
66            }
67        }
68
69        // Pad shorter columns with NAN if necessary (though usually rows are uniform)
70        for col in columns.values_mut() {
71            while col.len() < timestamps.len() {
72                col.push(f64::NAN);
73            }
74        }
75
76        Self {
77            columns,
78            timestamps,
79            id: id.to_string(),
80            timeframe,
81        }
82    }
83
84    /// Add a column of data
85    pub fn add_column(&mut self, name: &str, data: Vec<f64>) {
86        self.columns.insert(name.to_string(), data);
87    }
88
89    /// Get a column by name
90    pub fn get_column(&self, name: &str) -> Option<&[f64]> {
91        self.columns.get(name).map(|v| v.as_slice())
92    }
93
94    /// Get a mutable column by name
95    pub fn get_column_mut(&mut self, name: &str) -> Option<&mut Vec<f64>> {
96        self.columns.get_mut(name)
97    }
98
99    /// Get the number of rows
100    pub fn row_count(&self) -> usize {
101        self.timestamps.len()
102    }
103
104    /// Check if empty
105    pub fn is_empty(&self) -> bool {
106        self.timestamps.is_empty()
107    }
108
109    /// Get the number of columns
110    pub fn column_count(&self) -> usize {
111        self.columns.len()
112    }
113
114    /// Get column names
115    pub fn column_names(&self) -> impl Iterator<Item = &str> {
116        self.columns.keys().map(|s| s.as_str())
117    }
118
119    /// Check if a column exists
120    pub fn has_column(&self, name: &str) -> bool {
121        self.columns.contains_key(name)
122    }
123
124    /// Get a row view at the given index
125    pub fn get_row(&self, index: usize) -> Option<DataRow<'_>> {
126        if index < self.row_count() {
127            Some(DataRow {
128                dataframe: self,
129                index,
130            })
131        } else {
132            None
133        }
134    }
135
136    /// Get timestamp at index
137    pub fn get_timestamp(&self, index: usize) -> Option<i64> {
138        self.timestamps.get(index).copied()
139    }
140
141    /// Get value at (row, column)
142    pub fn get_value(&self, row: usize, column: &str) -> Option<f64> {
143        self.columns
144            .get(column)
145            .and_then(|col| col.get(row))
146            .copied()
147    }
148
149    /// Create a schema from this DataFrame's columns
150    pub fn schema(&self) -> Vec<String> {
151        self.columns.keys().cloned().collect()
152    }
153
154    /// Slice the DataFrame to a range of rows
155    pub fn slice(&self, start: usize, end: usize) -> Self {
156        let end = end.min(self.row_count());
157        let start = start.min(end);
158
159        let mut df = Self::new(&self.id, self.timeframe);
160        df.timestamps = self.timestamps[start..end].to_vec();
161
162        for (name, col) in &self.columns {
163            df.columns.insert(name.clone(), col[start..end].to_vec());
164        }
165
166        df
167    }
168}
169
170impl Default for DataFrame {
171    fn default() -> Self {
172        Self::new("", Timeframe::default())
173    }
174}
175
176/// A borrowed view of a single row in a DataFrame
177///
178/// Zero-copy access to row data - just stores reference and index.
179#[derive(Debug, Clone, Copy)]
180pub struct DataRow<'a> {
181    dataframe: &'a DataFrame,
182    index: usize,
183}
184
185/// An owned data row with generic fields
186///
187/// This struct provides an industry-agnostic type for storing arbitrary
188/// data rows. It uses a HashMap for field storage to support any schema.
189///
190/// For performance-critical paths, the JIT compiler generates optimized
191/// code when the type schema is known at compile time.
192#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
193pub struct OwnedDataRow {
194    /// Unix timestamp (seconds) - always present in time series data
195    pub timestamp: i64,
196    /// Generic field storage - any f64 fields
197    pub fields: std::collections::HashMap<String, f64>,
198}
199
200// DELETED: Legacy OHLCV accessor methods
201// Use generic row.get_field("field_name") instead
202// Finance-specific field names belong in stdlib, not Rust core
203
204impl OwnedDataRow {
205    /// Create a new generic OwnedDataRow with arbitrary fields
206    pub fn new_generic(timestamp: i64, fields: std::collections::HashMap<String, f64>) -> Self {
207        Self { timestamp, fields }
208    }
209
210    /// Create from HashMap of fields (alias for new_generic)
211    pub fn from_hashmap(timestamp: i64, fields: std::collections::HashMap<String, f64>) -> Self {
212        Self::new_generic(timestamp, fields)
213    }
214
215    /// Create from a DataRow by copying all available columns
216    pub fn from_data_row(row: &DataRow<'_>) -> Option<Self> {
217        let mut fields = std::collections::HashMap::new();
218
219        // Copy all columns from the DataFrame
220        for col_name in row.dataframe.columns.keys() {
221            if let Some(value) = row.get(col_name) {
222                fields.insert(col_name.clone(), value);
223            }
224        }
225
226        Some(Self {
227            timestamp: row.timestamp(),
228            fields,
229        })
230    }
231
232    /// Get a field by name
233    pub fn get_field(&self, field: &str) -> Option<f64> {
234        self.fields.get(field).copied()
235    }
236
237    /// Set a field value
238    pub fn set_field(&mut self, field: &str, value: f64) {
239        self.fields.insert(field.to_string(), value);
240    }
241
242    /// Check if field exists
243    pub fn has_field(&self, field: &str) -> bool {
244        self.fields.contains_key(field)
245    }
246
247    /// Get all field names
248    pub fn field_names(&self) -> impl Iterator<Item = &String> {
249        self.fields.keys()
250    }
251
252    /// Get timestamp as DateTime<Utc>
253    pub fn datetime(&self) -> chrono::DateTime<chrono::Utc> {
254        chrono::DateTime::from_timestamp(self.timestamp, 0).unwrap_or_else(chrono::Utc::now)
255    }
256}
257
258impl<'a> DataRow<'a> {
259    /// Get the timestamp for this row
260    pub fn timestamp(&self) -> i64 {
261        self.dataframe.timestamps[self.index]
262    }
263
264    /// Get the row index
265    pub fn index(&self) -> usize {
266        self.index
267    }
268
269    /// Get a field value by name
270    pub fn get(&self, field: &str) -> Option<f64> {
271        self.dataframe
272            .columns
273            .get(field)
274            .and_then(|col| col.get(self.index))
275            .copied()
276    }
277
278    /// Get a field value with a default
279    pub fn get_or(&self, field: &str, default: f64) -> f64 {
280        self.get(field).unwrap_or(default)
281    }
282
283    /// Check if a field exists
284    pub fn has_field(&self, field: &str) -> bool {
285        self.dataframe.columns.contains_key(field)
286    }
287
288    /// Get all field names
289    pub fn fields(&self) -> impl Iterator<Item = &str> {
290        self.dataframe.column_names()
291    }
292}
293
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a frame named "TEST" on `Timeframe::d1()` holding the given
    /// timestamps and columns.
    fn sample_frame(timestamps: Vec<i64>, columns: &[(&str, Vec<f64>)]) -> DataFrame {
        let mut df = DataFrame::new("TEST", Timeframe::d1());
        df.timestamps = timestamps;
        for (name, data) in columns {
            df.add_column(name, data.clone());
        }
        df
    }

    /// Row/column counts, column presence and cell lookup on a small frame.
    #[test]
    fn test_dataframe_basic() {
        let df = sample_frame(
            vec![1000, 2000, 3000],
            &[
                ("value", vec![1.0, 2.0, 3.0]),
                ("other", vec![10.0, 20.0, 30.0]),
            ],
        );

        assert_eq!(df.row_count(), 3);
        assert_eq!(df.column_count(), 2);
        assert!(df.has_column("value"));
        assert!(!df.has_column("missing"));

        assert_eq!(df.get_value(1, "value"), Some(2.0));
        assert_eq!(df.get_value(1, "other"), Some(20.0));
    }

    /// A row view exposes its timestamp, its values, and a default for
    /// missing fields.
    #[test]
    fn test_datarow_access() {
        let df = sample_frame(
            vec![1000, 2000, 3000],
            &[("price", vec![100.0, 101.0, 102.0])],
        );

        let row = df.get_row(1).unwrap();
        assert_eq!(row.timestamp(), 2000);
        assert_eq!(row.get("price"), Some(101.0));
        assert_eq!(row.get_or("missing", 0.0), 0.0);
    }

    /// Slicing keeps timestamps and column values for the half-open range.
    #[test]
    fn test_dataframe_slice() {
        let df = sample_frame(
            vec![1000, 2000, 3000, 4000, 5000],
            &[("value", vec![1.0, 2.0, 3.0, 4.0, 5.0])],
        );

        let sliced = df.slice(1, 4);
        assert_eq!(sliced.row_count(), 3);
        assert_eq!(sliced.timestamps, vec![2000, 3000, 4000]);
        assert_eq!(sliced.get_column("value"), Some(&[2.0, 3.0, 4.0][..]));
    }
}