pandrs/dataframe/
base.rs

1use std::any::Any;
2use std::collections::HashMap;
3use std::fmt::Debug;
4use std::path::Path;
5
6use crate::column::ColumnType;
7use crate::core::data_value::{self, DataValue as DValue}; // Import as a different name to avoid trait conflict
8use crate::core::error::{Error, Result};
9
10// Re-export from legacy module for now
11#[deprecated(
12    since = "0.1.0-alpha.2",
13    note = "Use new DataFrame implementation in crate::dataframe::base"
14)]
15pub use crate::dataframe::DataFrame as LegacyDataFrame;
16
/// Type-erased view of a column, so that `Series` values with different
/// element types can be stored side by side in one `DataFrame`.
trait ColumnAny: Debug + Send + Sync {
    /// Exposes the concrete value as `Any` so that callers can downcast it.
    fn as_any(&self) -> &dyn Any;

    /// Number of elements in the column.
    fn len(&self) -> usize;

    /// Name of the column's element type, as a string.
    fn column_type_string(&self) -> String;

    /// Clones the column into a fresh box.
    fn clone_box(&self) -> Box<dyn ColumnAny + Send + Sync>;

    /// Returns `true` when the column holds no elements.
    fn is_empty(&self) -> bool {
        matches!(self.len(), 0)
    }
}
27
28impl<T: 'static + Debug + Clone + Send + Sync> ColumnAny for crate::series::Series<T> {
29    fn as_any(&self) -> &dyn Any {
30        self
31    }
32
33    fn len(&self) -> usize {
34        self.len()
35    }
36
37    fn column_type_string(&self) -> String {
38        std::any::type_name::<T>().to_string()
39    }
40
41    fn clone_box(&self) -> Box<dyn ColumnAny + Send + Sync> {
42        Box::new(self.clone())
43    }
44}
45
46impl Clone for Box<dyn ColumnAny + Send + Sync> {
47    fn clone(&self) -> Self {
48        self.clone_box()
49    }
50}
51
52/// DataFrame struct: Column-oriented 2D data structure
53#[derive(Debug, Clone)]
54pub struct DataFrame {
55    // Actual fields for storage
56    columns: HashMap<String, Box<dyn ColumnAny + Send + Sync>>,
57    column_order: Vec<String>,
58    row_count: usize,
59}
60
61impl DataFrame {
62    /// Create a new empty DataFrame
63    pub fn new() -> Self {
64        Self {
65            columns: HashMap::new(),
66            column_order: Vec::new(),
67            row_count: 0,
68        }
69    }
70
71    /// Create a new DataFrame with a simple index
72    pub fn with_index(index: crate::index::Index<String>) -> Self {
73        let mut df = Self::new();
74        df.row_count = index.len();
75        df
76    }
77
78    /// Create a new DataFrame with a multi index
79    pub fn with_multi_index(multi_index: crate::index::MultiIndex<String>) -> Self {
80        let mut df = Self::new();
81        df.row_count = multi_index.len();
82        df
83    }
84
85    /// Check if the DataFrame contains a column with the given name
86    pub fn contains_column(&self, column_name: &str) -> bool {
87        self.columns.contains_key(column_name)
88    }
89
90    /// Get the number of rows in the DataFrame
91    pub fn row_count(&self) -> usize {
92        self.row_count
93    }
94
95    /// Get the number of rows (alias for compatibility)
96    pub fn nrows(&self) -> usize {
97        self.row_count
98    }
99
100    /// Get a string value from the DataFrame
101    pub fn get_string_value(&self, column_name: &str, row_idx: usize) -> Result<&str> {
102        // Check if column exists
103        let col = self
104            .columns
105            .get(column_name)
106            .ok_or_else(|| Error::ColumnNotFound(column_name.to_string()))?;
107
108        // Check if row index is valid
109        if row_idx >= self.row_count {
110            return Err(Error::InvalidValue(format!(
111                "Row index {} is out of bounds for DataFrame with {} rows",
112                row_idx, self.row_count
113            )));
114        }
115
116        // Try to downcast to Series<String> and get the value
117        if let Some(string_series) = col.as_any().downcast_ref::<crate::series::Series<String>>() {
118            if let Some(value) = string_series.get(row_idx) {
119                Ok(value)
120            } else {
121                Err(Error::InvalidValue(format!(
122                    "No value found at row {} in column '{}'",
123                    row_idx, column_name
124                )))
125            }
126        } else {
127            // If it's not a string column, try to convert other types to string
128            // But since we need to return &str, we can't create temporary strings
129            Err(Error::InvalidValue(format!(
130                "Column '{}' is not a string column. Use get_column_string_values() for type conversion.",
131                column_name
132            )))
133        }
134    }
135
136    /// Add a column to the DataFrame
137    pub fn add_column<T: 'static + Debug + Clone + Send + Sync>(
138        &mut self,
139        column_name: String,
140        series: crate::series::Series<T>,
141    ) -> Result<()> {
142        // Check if column already exists
143        if self.contains_column(&column_name) {
144            return Err(Error::DuplicateColumnName(column_name));
145        }
146
147        // Check length consistency
148        let series_len = series.len();
149        if !self.columns.is_empty() && series_len != self.row_count {
150            return Err(Error::InconsistentRowCount {
151                expected: self.row_count,
152                found: series_len,
153            });
154        }
155
156        // Add the column
157        self.columns.insert(column_name.clone(), Box::new(series));
158        self.column_order.push(column_name);
159
160        // Update row count if this is the first column
161        if self.row_count == 0 {
162            self.row_count = series_len;
163        }
164
165        Ok(())
166    }
167
168    /// Get column names in the DataFrame
169    pub fn column_names(&self) -> Vec<String> {
170        self.column_order.clone()
171    }
172
173    /// Rename columns in the DataFrame using a mapping
174    pub fn rename_columns(&mut self, column_map: &HashMap<String, String>) -> Result<()> {
175        // First, validate that all old column names exist
176        for old_name in column_map.keys() {
177            if !self.contains_column(old_name) {
178                return Err(Error::ColumnNotFound(old_name.clone()));
179            }
180        }
181
182        // Check for duplicate new names
183        let mut new_names_set = std::collections::HashSet::new();
184        for new_name in column_map.values() {
185            if !new_names_set.insert(new_name) {
186                return Err(Error::DuplicateColumnName(new_name.clone()));
187            }
188        }
189
190        // Check that new names don't conflict with existing column names (except those being renamed)
191        for new_name in column_map.values() {
192            if self.contains_column(new_name) && !column_map.contains_key(new_name) {
193                return Err(Error::DuplicateColumnName(new_name.clone()));
194            }
195        }
196
197        // Apply the renaming
198        for (old_name, new_name) in column_map {
199            // Update the column_order vector
200            if let Some(pos) = self.column_order.iter().position(|x| x == old_name) {
201                self.column_order[pos] = new_name.clone();
202            }
203
204            // Move the column data to the new key
205            if let Some(column_data) = self.columns.remove(old_name) {
206                self.columns.insert(new_name.clone(), column_data);
207            }
208        }
209
210        Ok(())
211    }
212
213    /// Set all column names in the DataFrame
214    pub fn set_column_names(&mut self, names: Vec<String>) -> Result<()> {
215        // Check that the number of names matches the number of columns
216        if names.len() != self.column_order.len() {
217            return Err(Error::InconsistentRowCount {
218                expected: self.column_order.len(),
219                found: names.len(),
220            });
221        }
222
223        // Check for duplicate names
224        let mut names_set = std::collections::HashSet::new();
225        for name in &names {
226            if !names_set.insert(name) {
227                return Err(Error::DuplicateColumnName(name.clone()));
228            }
229        }
230
231        // Create a mapping from old names to new names
232        let mut column_map = HashMap::new();
233        for (old_name, new_name) in self.column_order.iter().zip(names.iter()) {
234            column_map.insert(old_name.clone(), new_name.clone());
235        }
236
237        // Apply the renaming using the existing rename_columns method
238        self.rename_columns(&column_map)
239    }
240
241    /// Get a column from the DataFrame with generic type
242    pub fn get_column<T: 'static + Debug + Clone + Send + Sync>(
243        &self,
244        column_name: &str,
245    ) -> Result<&crate::series::Series<T>> {
246        let col = self
247            .columns
248            .get(column_name)
249            .ok_or_else(|| Error::ColumnNotFound(column_name.to_string()))?;
250
251        // Cast to the specific Series type
252        match col.as_any().downcast_ref::<crate::series::Series<T>>() {
253            Some(series) => Ok(series),
254            None => Err(Error::InvalidValue(format!(
255                "Column '{}' is not of the requested type",
256                column_name
257            ))),
258        }
259    }
260
261    /// Get string values from a column
262    pub fn get_column_string_values(&self, column_name: &str) -> Result<Vec<String>> {
263        if !self.contains_column(column_name) {
264            return Err(Error::ColumnNotFound(column_name.to_string()));
265        }
266
267        let column = self.columns.get(column_name).unwrap();
268
269        // Try to downcast to different Series types and convert to strings
270        if let Some(string_series) = column
271            .as_any()
272            .downcast_ref::<crate::series::Series<String>>()
273        {
274            Ok(string_series.values().to_vec())
275        } else if let Some(i32_series) =
276            column.as_any().downcast_ref::<crate::series::Series<i32>>()
277        {
278            Ok(i32_series
279                .values()
280                .iter()
281                .map(|v| ToString::to_string(v))
282                .collect())
283        } else if let Some(i64_series) =
284            column.as_any().downcast_ref::<crate::series::Series<i64>>()
285        {
286            Ok(i64_series
287                .values()
288                .iter()
289                .map(|v| ToString::to_string(v))
290                .collect())
291        } else if let Some(f32_series) =
292            column.as_any().downcast_ref::<crate::series::Series<f32>>()
293        {
294            Ok(f32_series
295                .values()
296                .iter()
297                .map(|v| ToString::to_string(v))
298                .collect())
299        } else if let Some(f64_series) =
300            column.as_any().downcast_ref::<crate::series::Series<f64>>()
301        {
302            Ok(f64_series
303                .values()
304                .iter()
305                .map(|v| ToString::to_string(v))
306                .collect())
307        } else if let Some(bool_series) = column
308            .as_any()
309            .downcast_ref::<crate::series::Series<bool>>()
310        {
311            Ok(bool_series
312                .values()
313                .iter()
314                .map(|v| ToString::to_string(v))
315                .collect())
316        } else {
317            // Fallback for unsupported types
318            let mut result = Vec::with_capacity(self.row_count);
319            for i in 0..self.row_count {
320                result.push(format!("unsupported_type_{}_{}", column_name, i));
321            }
322            Ok(result)
323        }
324    }
325
326    /// Get a column by index (compatibility method)
327    pub fn column_name(&self, idx: usize) -> Option<&String> {
328        self.column_order.get(idx)
329    }
330
331    /// Concat rows from another DataFrame
332    pub fn concat_rows(&self, _other: &DataFrame) -> Result<DataFrame> {
333        // Implement concatenation properly when needed
334        Ok(Self::new())
335    }
336
    /// Write the DataFrame to a CSV file at `_path`.
    ///
    /// NOTE: not implemented yet — nothing is written and the call always
    /// returns `Ok(())`.
    pub fn to_csv<P: AsRef<Path>>(&self, _path: P) -> Result<()> {
        // Implement CSV export when needed
        Ok(())
    }
342
343    /// Create DataFrame from CSV
344    pub fn from_csv<P: AsRef<Path>>(_path: P, _has_header: bool) -> Result<Self> {
345        // Implement CSV import when needed
346        Ok(Self::new())
347    }
348
349    /// Create DataFrame from CSV reader
350    pub fn from_csv_reader<R: std::io::Read>(
351        reader: &mut csv::Reader<R>,
352        has_header: bool,
353    ) -> Result<Self> {
354        let mut df = Self::new();
355
356        // Get headers
357        let headers: Vec<String> = if has_header {
358            reader
359                .headers()
360                .map_err(|e| Error::IoError(format!("CSV header error: {}", e)))?
361                .iter()
362                .map(|h| h.to_string())
363                .collect()
364        } else {
365            // Peek at first record to determine column count
366            let mut records = reader.records();
367            if let Some(first_record) = records.next() {
368                let record =
369                    first_record.map_err(|e| Error::IoError(format!("CSV read error: {}", e)))?;
370                (0..record.len()).map(|i| format!("column_{}", i)).collect()
371            } else {
372                return Ok(df); // Empty file
373            }
374        };
375
376        // Collect data for each column
377        let mut columns_data: std::collections::HashMap<String, Vec<String>> =
378            std::collections::HashMap::new();
379        for header in &headers {
380            columns_data.insert(header.clone(), Vec::new());
381        }
382
383        // Process records
384        for result in reader.records() {
385            let record = result.map_err(|e| Error::IoError(format!("CSV read error: {}", e)))?;
386            for (i, header) in headers.iter().enumerate() {
387                let value = if i < record.len() {
388                    record[i].to_string()
389                } else {
390                    String::new()
391                };
392                columns_data.get_mut(header).unwrap().push(value);
393            }
394        }
395
396        // Add columns to DataFrame
397        for header in headers {
398            if let Some(values) = columns_data.remove(&header) {
399                let series = crate::series::Series::new(values, Some(header.clone()))?;
400                df.add_column(header, series)?;
401            }
402        }
403
404        Ok(df)
405    }
406
407    /// Get the number of columns in the DataFrame
408    pub fn column_count(&self) -> usize {
409        self.columns.len()
410    }
411
412    /// Get the number of columns (alias for compatibility)
413    pub fn ncols(&self) -> usize {
414        self.column_count()
415    }
416
417    /// Create a new DataFrame with only the specified columns
418    pub fn select_columns(&self, columns: &[&str]) -> Result<Self> {
419        let result = Self::new();
420
421        for &column_name in columns {
422            if !self.contains_column(column_name) {
423                return Err(Error::ColumnNotFound(column_name.to_string()));
424            }
425
426            // For simplicity, we're just creating a stub result for now
427            // In a real implementation, we would copy over the actual column data
428        }
429
430        Ok(result)
431    }
432
433    /// Create a new DataFrame from a HashMap of column names to string vectors
434    pub fn from_map(
435        data: std::collections::HashMap<String, Vec<String>>,
436        index: Option<crate::index::Index<String>>,
437    ) -> Result<Self> {
438        let mut df = Self::new();
439
440        // If index is provided, set row count
441        if let Some(idx) = index {
442            df.row_count = idx.len();
443        } else {
444            // Otherwise, determine row count from data
445            df.row_count = data.values().map(|v| v.len()).max().unwrap_or(0);
446        }
447
448        // Add columns
449        for (col_name, values) in data {
450            // Create a Series of strings
451            let series = crate::series::Series::new(values, Some(col_name.clone()))?;
452            df.add_column(col_name, series)?;
453        }
454
455        Ok(df)
456    }
457
458    /// Create a new DataFrame from JSON string
459    /// Expects JSON format like: {"col1": ["val1", "val2"], "col2": ["val3", "val4"]}
460    pub fn from_json(json_str: &str) -> Result<Self> {
461        use serde_json::Value;
462
463        // Parse the JSON string
464        let parsed: Value = serde_json::from_str(json_str)
465            .map_err(|e| Error::InvalidInput(format!("Failed to parse JSON: {}", e)))?;
466
467        // Convert JSON object to HashMap
468        let mut data: std::collections::HashMap<String, Vec<String>> =
469            std::collections::HashMap::new();
470
471        if let Value::Object(obj) = parsed {
472            for (col_name, col_values) in obj {
473                if let Value::Array(values) = col_values {
474                    let string_values: Vec<String> = values
475                        .into_iter()
476                        .map(|v| match v {
477                            Value::String(s) => s,
478                            Value::Number(n) => n.to_string(),
479                            Value::Bool(b) => ToString::to_string(&b),
480                            Value::Null => "".to_string(),
481                            _ => v.to_string(),
482                        })
483                        .collect();
484                    data.insert(col_name, string_values);
485                } else {
486                    return Err(Error::InvalidInput(format!(
487                        "Column '{}' is not an array",
488                        col_name
489                    )));
490                }
491            }
492        } else {
493            return Err(Error::InvalidInput("JSON must be an object".to_string()));
494        }
495
496        // Use existing from_map method
497        Self::from_map(data, None)
498    }
499
500    /// Check if the DataFrame has the specified column (alias for contains_column)
501    pub fn has_column(&self, column_name: &str) -> bool {
502        self.contains_column(column_name)
503    }
504
505    /// Get the DataFrame's index
506    pub fn get_index(&self) -> crate::index::DataFrameIndex<String> {
507        // For now, we don't have an actual implementation of index in the new DataFrame
508        // So we return a default index structure
509        crate::index::DataFrameIndex::Simple(crate::index::Index::default())
510    }
511
512    /// Set the DataFrame's index from an Index
513    pub fn set_index(&mut self, index: crate::index::Index<String>) -> Result<()> {
514        // Stub implementation - would actually set the index
515        Ok(())
516    }
517
518    /// Set a multi-index for the DataFrame
519    pub fn set_multi_index(&mut self, multi_index: crate::index::MultiIndex<String>) -> Result<()> {
520        // Stub implementation - would actually set the multi-index
521        Ok(())
522    }
523
524    // Using the implementation at line 152 instead
525
526    /// Get numeric values from a column
527    pub fn get_column_numeric_values(&self, column_name: &str) -> Result<Vec<f64>> {
528        // Get the column
529        let col = self
530            .columns
531            .get(column_name)
532            .ok_or_else(|| Error::ColumnNotFound(column_name.to_string()))?;
533
534        // Extract numeric values
535        let mut values = Vec::with_capacity(self.row_count);
536        for i in 0..self.row_count {
537            // Try to get the value as a numeric type
538            let val = match col.as_any().downcast_ref::<crate::series::Series<f64>>() {
539                Some(float_series) => {
540                    if let Some(value) = float_series.get(i) {
541                        *value // Use the f64 value directly
542                    } else {
543                        return Err(Error::InvalidValue(format!(
544                            "Missing value at index {} in column '{}'",
545                            i, column_name
546                        )));
547                    }
548                }
549                None => {
550                    // Try other numeric types
551                    match col.as_any().downcast_ref::<crate::series::Series<i64>>() {
552                        Some(int_series) => {
553                            if let Some(value) = int_series.get(i) {
554                                *value as f64 // Convert i64 to f64
555                            } else {
556                                return Err(Error::InvalidValue(format!(
557                                    "Missing value at index {} in column '{}'",
558                                    i, column_name
559                                )));
560                            }
561                        }
562                        None => {
563                            // Try string values that might be parseable as numbers
564                            match col.as_any().downcast_ref::<crate::series::Series<String>>() {
565                                Some(str_series) => {
566                                    if let Some(value) = str_series.get(i) {
567                                        // Try to parse the string as a float
568                                        match value.parse::<f64>() {
569                                            Ok(num) => num,
570                                            Err(_) => return Err(Error::InvalidValue(format!(
571                                                "Value '{}' at index {} in column '{}' cannot be converted to numeric",
572                                                value, i, column_name
573                                            ))),
574                                        }
575                                    } else {
576                                        return Err(Error::InvalidValue(format!(
577                                            "Missing value at index {} in column '{}'",
578                                            i, column_name
579                                        )));
580                                    }
581                                }
582                                None => {
583                                    // If we can't find a suitable type, return an error
584                                    return Err(Error::InvalidValue(format!(
585                                        "Column '{}' cannot be converted to numeric values",
586                                        column_name
587                                    )));
588                                }
589                            }
590                        }
591                    }
592                }
593            };
594
595            values.push(val);
596        }
597
598        Ok(values)
599    }
600
601    /// Add a row to the DataFrame
602    pub fn add_row_data(&mut self, row_data: Vec<Box<dyn DValue>>) -> Result<()> {
603        // Check if the row size matches the number of columns
604        if row_data.len() != self.column_order.len() {
605            return Err(Error::InconsistentRowCount {
606                expected: self.column_order.len(),
607                found: row_data.len(),
608            });
609        }
610
611        // For now, just increase row count as we don't have a full implementation
612        self.row_count += 1;
613
614        Ok(())
615    }
616
617    /// Filter rows based on a predicate
618    pub fn filter<F>(&self, column_name: &str, predicate: F) -> Result<Self>
619    where
620        F: Fn(&Box<dyn DValue>) -> bool,
621    {
622        // Check if the column exists
623        if !self.contains_column(column_name) {
624            return Err(Error::ColumnNotFound(column_name.to_string()));
625        }
626
627        // For now, just return an empty DataFrame as we don't have a full implementation
628        Ok(Self::new())
629    }
630
631    /// Compute the mean of a column
632    pub fn mean(&self, column_name: &str) -> Result<f64> {
633        // Get numeric values from the column
634        let values = self.get_column_numeric_values(column_name)?;
635
636        if values.is_empty() {
637            return Err(Error::EmptySeries);
638        }
639
640        // Compute mean
641        let sum: f64 = values.iter().sum();
642        Ok(sum / values.len() as f64)
643    }
644
    /// Group by a column
    ///
    /// NOTE: placeholder — no grouping is performed, `_column_name` is not
    /// even checked for existence, and the call always returns `Ok(())`.
    pub fn group_by(&self, _column_name: &str) -> Result<()> {
        // Placeholder implementation
        Ok(())
    }
650
651    /// Enable GPU acceleration for a DataFrame
652    pub fn gpu_accelerate(&self) -> Result<Self> {
653        // For now, just return a clone as we don't have a full implementation
654        Ok(self.clone())
655    }
656
    /// Calculate a correlation matrix
    ///
    /// NOTE: placeholder — nothing is computed, `_columns` is ignored, and the
    /// call always returns `Ok(())`.
    pub fn corr_matrix(&self, _columns: &[&str]) -> Result<()> {
        // Placeholder implementation
        Ok(())
    }
662
663    /// Display the head of the DataFrame
664    pub fn head(&self, n: usize) -> Result<String> {
665        let mut result = String::new();
666
667        // Add header row
668        for col_name in &self.column_order {
669            result.push_str(&format!("{}\t", col_name));
670        }
671        result.push('\n');
672
673        // Add data rows (limited to n)
674        let row_limit = n.min(self.row_count);
675        for row_idx in 0..row_limit {
676            for col_name in &self.column_order {
677                // Simplistic approach - just add placeholder values
678                result.push_str("[val]\t");
679            }
680            result.push('\n');
681        }
682
683        Ok(result)
684    }
685
686    /// Add a row to the DataFrame using a HashMap of column names to values
687    pub fn add_row_data_from_hashmap(&mut self, row_data: HashMap<String, String>) -> Result<()> {
688        // Check if all required columns exist
689        for col_name in row_data.keys() {
690            if !self.contains_column(col_name) {
691                return Err(Error::ColumnNotFound(col_name.clone()));
692            }
693        }
694
695        // For now, just increment row count as we don't have a full implementation
696        self.row_count += 1;
697
698        Ok(())
699    }
700
701    /// Check if a column is categorical
702    pub fn is_categorical(&self, column_name: &str) -> bool {
703        // For simplicity, just check if it exists in this implementation
704        // A real implementation would check metadata or column type
705        self.contains_column(column_name)
706    }
707
708    /// Get a categorical column with generic type
709    pub fn sample(&self, indices: &[usize]) -> Result<Self> {
710        // Stub implementation - for compatibility only
711        Ok(Self::new())
712    }
713
714    /// Get a categorical column with generic type
715    pub fn get_categorical<T: 'static + Debug + Clone + Eq + std::hash::Hash + Send + Sync>(
716        &self,
717        column_name: &str,
718    ) -> Result<crate::series::categorical::Categorical<T>> {
719        // Check if the column exists
720        if !self.contains_column(column_name) {
721            return Err(Error::ColumnNotFound(column_name.to_string()));
722        }
723
724        // Get column data as strings
725        let values_str = self.get_column_string_values(column_name)?;
726
727        // This is a simplified implementation for backward compatibility
728        // It assumes T is String, which is the most common case for categorical data
729        if std::any::TypeId::of::<T>() == std::any::TypeId::of::<String>() {
730            // Create a vector of the appropriate type (safely)
731            let values: Vec<T> = unsafe { std::mem::transmute(values_str) };
732
733            // Create a new categorical with default settings
734            return crate::series::categorical::Categorical::new(values, None, false);
735        }
736
737        // For non-string types, return empty categorical
738        let empty_vec: Vec<T> = Vec::new();
739        crate::series::categorical::Categorical::new(empty_vec, None, false)
740    }
741
742    /// Check if a column is numeric
743    pub fn is_numeric_column(&self, column_name: &str) -> bool {
744        // Stub implementation - for compatibility only
745        false
746    }
747
748    /// Add a NASeries as a categorical column
749    pub fn add_na_series_as_categorical(
750        &mut self,
751        name: String,
752        series: crate::series::NASeries<String>,
753        categories: Option<Vec<String>>,
754        ordered: Option<crate::series::categorical::CategoricalOrder>,
755    ) -> Result<&mut Self> {
756        // Create a categorical from the NASeries
757        let cat = crate::series::categorical::StringCategorical::from_na_vec(
758            series.values().to_vec(),
759            categories,
760            ordered,
761        )?;
762
763        // Convert categorical to regular series
764        let regular_series = cat.to_series(Some(name.clone()))?;
765
766        // Add to DataFrame
767        self.add_column(name, regular_series)?;
768
769        Ok(self)
770    }
771
772    /// Create a DataFrame from multiple categorical data
773    pub fn from_categoricals(
774        categoricals: Vec<(String, crate::series::categorical::StringCategorical)>,
775    ) -> Result<Self> {
776        let mut df = Self::new();
777
778        // Check if all categorical data have the same length
779        if !categoricals.is_empty() {
780            let first_len = categoricals[0].1.len();
781            for (name, cat) in &categoricals {
782                if cat.len() != first_len {
783                    return Err(Error::InconsistentRowCount {
784                        expected: first_len,
785                        found: cat.len(),
786                    });
787                }
788            }
789        }
790
791        for (name, cat) in categoricals {
792            // Convert categorical to series
793            let series = cat.to_series(Some(name.clone()))?;
794
795            // Add as a column
796            df.add_column(name.clone(), series)?;
797        }
798
799        Ok(df)
800    }
801
802    /// Calculate the occurrence count of a column
803    pub fn value_counts(&self, column_name: &str) -> Result<crate::series::Series<usize>> {
804        // Check if the column exists
805        if !self.contains_column(column_name) {
806            return Err(Error::ColumnNotFound(column_name.to_string()));
807        }
808
809        // Get string values from the column
810        let values = self.get_column_string_values(column_name)?;
811
812        // Count occurrences
813        let mut counts = std::collections::HashMap::new();
814        for value in values {
815            *counts.entry(value).or_insert(0) += 1;
816        }
817
818        // Convert to vectors for Series
819        let mut values_vec = Vec::new();
820        let mut counts_vec = Vec::new();
821
822        for (value, count) in counts {
823            values_vec.push(value);
824            counts_vec.push(count);
825        }
826
827        // Create the Series object
828        crate::series::Series::new(counts_vec, Some(format!("{}_counts", column_name)))
829    }
830}