1use std::any::Any;
2use std::collections::HashMap;
3use std::fmt::Debug;
4use std::path::Path;
5
6use crate::column::ColumnType;
7use crate::core::data_value::{self, DataValue as DValue}; use crate::core::error::{Error, Result};
9
10#[deprecated(
12 since = "0.1.0-alpha.2",
13 note = "Use new DataFrame implementation in crate::dataframe::base"
14)]
15pub use crate::dataframe::DataFrame as LegacyDataFrame;
16
/// Type-erased view of a column, so that series with different element types
/// can be stored behind one trait object.
///
/// `Debug`, `Send` and `Sync` are required of every implementor.
trait ColumnAny: Debug + Send + Sync {
    /// Returns a boxed clone of this column as a trait object.
    fn clone_box(&self) -> Box<dyn ColumnAny + Send + Sync>;

    /// Returns a textual description of the column's element type.
    fn column_type_string(&self) -> String;

    /// Returns the number of elements in the column.
    fn len(&self) -> usize;

    /// Exposes the column as `&dyn Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn Any;

    /// Returns `true` when [`len`](Self::len) reports zero elements.
    fn is_empty(&self) -> bool {
        matches!(self.len(), 0)
    }
}
27
28impl<T: 'static + Debug + Clone + Send + Sync> ColumnAny for crate::series::Series<T> {
29 fn as_any(&self) -> &dyn Any {
30 self
31 }
32
33 fn len(&self) -> usize {
34 self.len()
35 }
36
37 fn column_type_string(&self) -> String {
38 std::any::type_name::<T>().to_string()
39 }
40
41 fn clone_box(&self) -> Box<dyn ColumnAny + Send + Sync> {
42 Box::new(self.clone())
43 }
44}
45
46impl Clone for Box<dyn ColumnAny + Send + Sync> {
47 fn clone(&self) -> Self {
48 self.clone_box()
49 }
50}
51
52#[derive(Debug, Clone)]
54pub struct DataFrame {
55 columns: HashMap<String, Box<dyn ColumnAny + Send + Sync>>,
57 column_order: Vec<String>,
58 row_count: usize,
59}
60
61impl DataFrame {
62 pub fn new() -> Self {
64 Self {
65 columns: HashMap::new(),
66 column_order: Vec::new(),
67 row_count: 0,
68 }
69 }
70
71 pub fn with_index(index: crate::index::Index<String>) -> Self {
73 let mut df = Self::new();
74 df.row_count = index.len();
75 df
76 }
77
78 pub fn with_multi_index(multi_index: crate::index::MultiIndex<String>) -> Self {
80 let mut df = Self::new();
81 df.row_count = multi_index.len();
82 df
83 }
84
85 pub fn contains_column(&self, column_name: &str) -> bool {
87 self.columns.contains_key(column_name)
88 }
89
90 pub fn row_count(&self) -> usize {
92 self.row_count
93 }
94
95 pub fn nrows(&self) -> usize {
97 self.row_count
98 }
99
100 pub fn get_string_value(&self, column_name: &str, row_idx: usize) -> Result<&str> {
102 let col = self
104 .columns
105 .get(column_name)
106 .ok_or_else(|| Error::ColumnNotFound(column_name.to_string()))?;
107
108 if row_idx >= self.row_count {
110 return Err(Error::InvalidValue(format!(
111 "Row index {} is out of bounds for DataFrame with {} rows",
112 row_idx, self.row_count
113 )));
114 }
115
116 if let Some(string_series) = col.as_any().downcast_ref::<crate::series::Series<String>>() {
118 if let Some(value) = string_series.get(row_idx) {
119 Ok(value)
120 } else {
121 Err(Error::InvalidValue(format!(
122 "No value found at row {} in column '{}'",
123 row_idx, column_name
124 )))
125 }
126 } else {
127 Err(Error::InvalidValue(format!(
130 "Column '{}' is not a string column. Use get_column_string_values() for type conversion.",
131 column_name
132 )))
133 }
134 }
135
136 pub fn add_column<T: 'static + Debug + Clone + Send + Sync>(
138 &mut self,
139 column_name: String,
140 series: crate::series::Series<T>,
141 ) -> Result<()> {
142 if self.contains_column(&column_name) {
144 return Err(Error::DuplicateColumnName(column_name));
145 }
146
147 let series_len = series.len();
149 if !self.columns.is_empty() && series_len != self.row_count {
150 return Err(Error::InconsistentRowCount {
151 expected: self.row_count,
152 found: series_len,
153 });
154 }
155
156 self.columns.insert(column_name.clone(), Box::new(series));
158 self.column_order.push(column_name);
159
160 if self.row_count == 0 {
162 self.row_count = series_len;
163 }
164
165 Ok(())
166 }
167
168 pub fn column_names(&self) -> Vec<String> {
170 self.column_order.clone()
171 }
172
173 pub fn rename_columns(&mut self, column_map: &HashMap<String, String>) -> Result<()> {
175 for old_name in column_map.keys() {
177 if !self.contains_column(old_name) {
178 return Err(Error::ColumnNotFound(old_name.clone()));
179 }
180 }
181
182 let mut new_names_set = std::collections::HashSet::new();
184 for new_name in column_map.values() {
185 if !new_names_set.insert(new_name) {
186 return Err(Error::DuplicateColumnName(new_name.clone()));
187 }
188 }
189
190 for new_name in column_map.values() {
192 if self.contains_column(new_name) && !column_map.contains_key(new_name) {
193 return Err(Error::DuplicateColumnName(new_name.clone()));
194 }
195 }
196
197 for (old_name, new_name) in column_map {
199 if let Some(pos) = self.column_order.iter().position(|x| x == old_name) {
201 self.column_order[pos] = new_name.clone();
202 }
203
204 if let Some(column_data) = self.columns.remove(old_name) {
206 self.columns.insert(new_name.clone(), column_data);
207 }
208 }
209
210 Ok(())
211 }
212
213 pub fn set_column_names(&mut self, names: Vec<String>) -> Result<()> {
215 if names.len() != self.column_order.len() {
217 return Err(Error::InconsistentRowCount {
218 expected: self.column_order.len(),
219 found: names.len(),
220 });
221 }
222
223 let mut names_set = std::collections::HashSet::new();
225 for name in &names {
226 if !names_set.insert(name) {
227 return Err(Error::DuplicateColumnName(name.clone()));
228 }
229 }
230
231 let mut column_map = HashMap::new();
233 for (old_name, new_name) in self.column_order.iter().zip(names.iter()) {
234 column_map.insert(old_name.clone(), new_name.clone());
235 }
236
237 self.rename_columns(&column_map)
239 }
240
241 pub fn get_column<T: 'static + Debug + Clone + Send + Sync>(
243 &self,
244 column_name: &str,
245 ) -> Result<&crate::series::Series<T>> {
246 let col = self
247 .columns
248 .get(column_name)
249 .ok_or_else(|| Error::ColumnNotFound(column_name.to_string()))?;
250
251 match col.as_any().downcast_ref::<crate::series::Series<T>>() {
253 Some(series) => Ok(series),
254 None => Err(Error::InvalidValue(format!(
255 "Column '{}' is not of the requested type",
256 column_name
257 ))),
258 }
259 }
260
261 pub fn get_column_string_values(&self, column_name: &str) -> Result<Vec<String>> {
263 if !self.contains_column(column_name) {
264 return Err(Error::ColumnNotFound(column_name.to_string()));
265 }
266
267 let column = self.columns.get(column_name).unwrap();
268
269 if let Some(string_series) = column
271 .as_any()
272 .downcast_ref::<crate::series::Series<String>>()
273 {
274 Ok(string_series.values().to_vec())
275 } else if let Some(i32_series) =
276 column.as_any().downcast_ref::<crate::series::Series<i32>>()
277 {
278 Ok(i32_series
279 .values()
280 .iter()
281 .map(|v| ToString::to_string(v))
282 .collect())
283 } else if let Some(i64_series) =
284 column.as_any().downcast_ref::<crate::series::Series<i64>>()
285 {
286 Ok(i64_series
287 .values()
288 .iter()
289 .map(|v| ToString::to_string(v))
290 .collect())
291 } else if let Some(f32_series) =
292 column.as_any().downcast_ref::<crate::series::Series<f32>>()
293 {
294 Ok(f32_series
295 .values()
296 .iter()
297 .map(|v| ToString::to_string(v))
298 .collect())
299 } else if let Some(f64_series) =
300 column.as_any().downcast_ref::<crate::series::Series<f64>>()
301 {
302 Ok(f64_series
303 .values()
304 .iter()
305 .map(|v| ToString::to_string(v))
306 .collect())
307 } else if let Some(bool_series) = column
308 .as_any()
309 .downcast_ref::<crate::series::Series<bool>>()
310 {
311 Ok(bool_series
312 .values()
313 .iter()
314 .map(|v| ToString::to_string(v))
315 .collect())
316 } else {
317 let mut result = Vec::with_capacity(self.row_count);
319 for i in 0..self.row_count {
320 result.push(format!("unsupported_type_{}_{}", column_name, i));
321 }
322 Ok(result)
323 }
324 }
325
326 pub fn column_name(&self, idx: usize) -> Option<&String> {
328 self.column_order.get(idx)
329 }
330
331 pub fn concat_rows(&self, _other: &DataFrame) -> Result<DataFrame> {
333 Ok(Self::new())
335 }
336
337 pub fn to_csv<P: AsRef<Path>>(&self, _path: P) -> Result<()> {
339 Ok(())
341 }
342
343 pub fn from_csv<P: AsRef<Path>>(_path: P, _has_header: bool) -> Result<Self> {
345 Ok(Self::new())
347 }
348
349 pub fn from_csv_reader<R: std::io::Read>(
351 reader: &mut csv::Reader<R>,
352 has_header: bool,
353 ) -> Result<Self> {
354 let mut df = Self::new();
355
356 let headers: Vec<String> = if has_header {
358 reader
359 .headers()
360 .map_err(|e| Error::IoError(format!("CSV header error: {}", e)))?
361 .iter()
362 .map(|h| h.to_string())
363 .collect()
364 } else {
365 let mut records = reader.records();
367 if let Some(first_record) = records.next() {
368 let record =
369 first_record.map_err(|e| Error::IoError(format!("CSV read error: {}", e)))?;
370 (0..record.len()).map(|i| format!("column_{}", i)).collect()
371 } else {
372 return Ok(df); }
374 };
375
376 let mut columns_data: std::collections::HashMap<String, Vec<String>> =
378 std::collections::HashMap::new();
379 for header in &headers {
380 columns_data.insert(header.clone(), Vec::new());
381 }
382
383 for result in reader.records() {
385 let record = result.map_err(|e| Error::IoError(format!("CSV read error: {}", e)))?;
386 for (i, header) in headers.iter().enumerate() {
387 let value = if i < record.len() {
388 record[i].to_string()
389 } else {
390 String::new()
391 };
392 columns_data.get_mut(header).unwrap().push(value);
393 }
394 }
395
396 for header in headers {
398 if let Some(values) = columns_data.remove(&header) {
399 let series = crate::series::Series::new(values, Some(header.clone()))?;
400 df.add_column(header, series)?;
401 }
402 }
403
404 Ok(df)
405 }
406
407 pub fn column_count(&self) -> usize {
409 self.columns.len()
410 }
411
412 pub fn ncols(&self) -> usize {
414 self.column_count()
415 }
416
417 pub fn select_columns(&self, columns: &[&str]) -> Result<Self> {
419 let result = Self::new();
420
421 for &column_name in columns {
422 if !self.contains_column(column_name) {
423 return Err(Error::ColumnNotFound(column_name.to_string()));
424 }
425
426 }
429
430 Ok(result)
431 }
432
433 pub fn from_map(
435 data: std::collections::HashMap<String, Vec<String>>,
436 index: Option<crate::index::Index<String>>,
437 ) -> Result<Self> {
438 let mut df = Self::new();
439
440 if let Some(idx) = index {
442 df.row_count = idx.len();
443 } else {
444 df.row_count = data.values().map(|v| v.len()).max().unwrap_or(0);
446 }
447
448 for (col_name, values) in data {
450 let series = crate::series::Series::new(values, Some(col_name.clone()))?;
452 df.add_column(col_name, series)?;
453 }
454
455 Ok(df)
456 }
457
458 pub fn from_json(json_str: &str) -> Result<Self> {
461 use serde_json::Value;
462
463 let parsed: Value = serde_json::from_str(json_str)
465 .map_err(|e| Error::InvalidInput(format!("Failed to parse JSON: {}", e)))?;
466
467 let mut data: std::collections::HashMap<String, Vec<String>> =
469 std::collections::HashMap::new();
470
471 if let Value::Object(obj) = parsed {
472 for (col_name, col_values) in obj {
473 if let Value::Array(values) = col_values {
474 let string_values: Vec<String> = values
475 .into_iter()
476 .map(|v| match v {
477 Value::String(s) => s,
478 Value::Number(n) => n.to_string(),
479 Value::Bool(b) => ToString::to_string(&b),
480 Value::Null => "".to_string(),
481 _ => v.to_string(),
482 })
483 .collect();
484 data.insert(col_name, string_values);
485 } else {
486 return Err(Error::InvalidInput(format!(
487 "Column '{}' is not an array",
488 col_name
489 )));
490 }
491 }
492 } else {
493 return Err(Error::InvalidInput("JSON must be an object".to_string()));
494 }
495
496 Self::from_map(data, None)
498 }
499
500 pub fn has_column(&self, column_name: &str) -> bool {
502 self.contains_column(column_name)
503 }
504
505 pub fn get_index(&self) -> crate::index::DataFrameIndex<String> {
507 crate::index::DataFrameIndex::Simple(crate::index::Index::default())
510 }
511
512 pub fn set_index(&mut self, index: crate::index::Index<String>) -> Result<()> {
514 Ok(())
516 }
517
518 pub fn set_multi_index(&mut self, multi_index: crate::index::MultiIndex<String>) -> Result<()> {
520 Ok(())
522 }
523
524 pub fn get_column_numeric_values(&self, column_name: &str) -> Result<Vec<f64>> {
528 let col = self
530 .columns
531 .get(column_name)
532 .ok_or_else(|| Error::ColumnNotFound(column_name.to_string()))?;
533
534 let mut values = Vec::with_capacity(self.row_count);
536 for i in 0..self.row_count {
537 let val = match col.as_any().downcast_ref::<crate::series::Series<f64>>() {
539 Some(float_series) => {
540 if let Some(value) = float_series.get(i) {
541 *value } else {
543 return Err(Error::InvalidValue(format!(
544 "Missing value at index {} in column '{}'",
545 i, column_name
546 )));
547 }
548 }
549 None => {
550 match col.as_any().downcast_ref::<crate::series::Series<i64>>() {
552 Some(int_series) => {
553 if let Some(value) = int_series.get(i) {
554 *value as f64 } else {
556 return Err(Error::InvalidValue(format!(
557 "Missing value at index {} in column '{}'",
558 i, column_name
559 )));
560 }
561 }
562 None => {
563 match col.as_any().downcast_ref::<crate::series::Series<String>>() {
565 Some(str_series) => {
566 if let Some(value) = str_series.get(i) {
567 match value.parse::<f64>() {
569 Ok(num) => num,
570 Err(_) => return Err(Error::InvalidValue(format!(
571 "Value '{}' at index {} in column '{}' cannot be converted to numeric",
572 value, i, column_name
573 ))),
574 }
575 } else {
576 return Err(Error::InvalidValue(format!(
577 "Missing value at index {} in column '{}'",
578 i, column_name
579 )));
580 }
581 }
582 None => {
583 return Err(Error::InvalidValue(format!(
585 "Column '{}' cannot be converted to numeric values",
586 column_name
587 )));
588 }
589 }
590 }
591 }
592 }
593 };
594
595 values.push(val);
596 }
597
598 Ok(values)
599 }
600
601 pub fn add_row_data(&mut self, row_data: Vec<Box<dyn DValue>>) -> Result<()> {
603 if row_data.len() != self.column_order.len() {
605 return Err(Error::InconsistentRowCount {
606 expected: self.column_order.len(),
607 found: row_data.len(),
608 });
609 }
610
611 self.row_count += 1;
613
614 Ok(())
615 }
616
617 pub fn filter<F>(&self, column_name: &str, predicate: F) -> Result<Self>
619 where
620 F: Fn(&Box<dyn DValue>) -> bool,
621 {
622 if !self.contains_column(column_name) {
624 return Err(Error::ColumnNotFound(column_name.to_string()));
625 }
626
627 Ok(Self::new())
629 }
630
631 pub fn mean(&self, column_name: &str) -> Result<f64> {
633 let values = self.get_column_numeric_values(column_name)?;
635
636 if values.is_empty() {
637 return Err(Error::EmptySeries);
638 }
639
640 let sum: f64 = values.iter().sum();
642 Ok(sum / values.len() as f64)
643 }
644
645 pub fn group_by(&self, _column_name: &str) -> Result<()> {
647 Ok(())
649 }
650
651 pub fn gpu_accelerate(&self) -> Result<Self> {
653 Ok(self.clone())
655 }
656
657 pub fn corr_matrix(&self, _columns: &[&str]) -> Result<()> {
659 Ok(())
661 }
662
663 pub fn head(&self, n: usize) -> Result<String> {
665 let mut result = String::new();
666
667 for col_name in &self.column_order {
669 result.push_str(&format!("{}\t", col_name));
670 }
671 result.push('\n');
672
673 let row_limit = n.min(self.row_count);
675 for row_idx in 0..row_limit {
676 for col_name in &self.column_order {
677 result.push_str("[val]\t");
679 }
680 result.push('\n');
681 }
682
683 Ok(result)
684 }
685
686 pub fn add_row_data_from_hashmap(&mut self, row_data: HashMap<String, String>) -> Result<()> {
688 for col_name in row_data.keys() {
690 if !self.contains_column(col_name) {
691 return Err(Error::ColumnNotFound(col_name.clone()));
692 }
693 }
694
695 self.row_count += 1;
697
698 Ok(())
699 }
700
701 pub fn is_categorical(&self, column_name: &str) -> bool {
703 self.contains_column(column_name)
706 }
707
708 pub fn sample(&self, indices: &[usize]) -> Result<Self> {
710 Ok(Self::new())
712 }
713
714 pub fn get_categorical<T: 'static + Debug + Clone + Eq + std::hash::Hash + Send + Sync>(
716 &self,
717 column_name: &str,
718 ) -> Result<crate::series::categorical::Categorical<T>> {
719 if !self.contains_column(column_name) {
721 return Err(Error::ColumnNotFound(column_name.to_string()));
722 }
723
724 let values_str = self.get_column_string_values(column_name)?;
726
727 if std::any::TypeId::of::<T>() == std::any::TypeId::of::<String>() {
730 let values: Vec<T> = unsafe { std::mem::transmute(values_str) };
732
733 return crate::series::categorical::Categorical::new(values, None, false);
735 }
736
737 let empty_vec: Vec<T> = Vec::new();
739 crate::series::categorical::Categorical::new(empty_vec, None, false)
740 }
741
742 pub fn is_numeric_column(&self, column_name: &str) -> bool {
744 false
746 }
747
748 pub fn add_na_series_as_categorical(
750 &mut self,
751 name: String,
752 series: crate::series::NASeries<String>,
753 categories: Option<Vec<String>>,
754 ordered: Option<crate::series::categorical::CategoricalOrder>,
755 ) -> Result<&mut Self> {
756 let cat = crate::series::categorical::StringCategorical::from_na_vec(
758 series.values().to_vec(),
759 categories,
760 ordered,
761 )?;
762
763 let regular_series = cat.to_series(Some(name.clone()))?;
765
766 self.add_column(name, regular_series)?;
768
769 Ok(self)
770 }
771
772 pub fn from_categoricals(
774 categoricals: Vec<(String, crate::series::categorical::StringCategorical)>,
775 ) -> Result<Self> {
776 let mut df = Self::new();
777
778 if !categoricals.is_empty() {
780 let first_len = categoricals[0].1.len();
781 for (name, cat) in &categoricals {
782 if cat.len() != first_len {
783 return Err(Error::InconsistentRowCount {
784 expected: first_len,
785 found: cat.len(),
786 });
787 }
788 }
789 }
790
791 for (name, cat) in categoricals {
792 let series = cat.to_series(Some(name.clone()))?;
794
795 df.add_column(name.clone(), series)?;
797 }
798
799 Ok(df)
800 }
801
802 pub fn value_counts(&self, column_name: &str) -> Result<crate::series::Series<usize>> {
804 if !self.contains_column(column_name) {
806 return Err(Error::ColumnNotFound(column_name.to_string()));
807 }
808
809 let values = self.get_column_string_values(column_name)?;
811
812 let mut counts = std::collections::HashMap::new();
814 for value in values {
815 *counts.entry(value).or_insert(0) += 1;
816 }
817
818 let mut values_vec = Vec::new();
820 let mut counts_vec = Vec::new();
821
822 for (value, count) in counts {
823 values_vec.push(value);
824 counts_vec.push(count);
825 }
826
827 crate::series::Series::new(counts_vec, Some(format!("{}_counts", column_name)))
829 }
830}