pandrs 0.3.0

A high-performance DataFrame library for Rust, providing a pandas-like API with advanced features including SIMD optimization, parallel processing, and distributed computing capabilities.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
//! Multi-index column results for hierarchical GroupBy operations
//!
//! This module provides advanced multi-index column structures for presenting
//! hierarchical aggregation results in a pandas-like format with proper
//! column hierarchies and intuitive navigation.

use std::collections::{BTreeMap, HashMap};
use std::fmt;

use crate::core::error::{Error, OptionExt, Result};
use crate::dataframe::base::DataFrame;
use crate::dataframe::hierarchical_groupby::{HierarchicalAgg, HierarchicalKey};
use crate::series::base::Series;

/// One column of a hierarchical (multi-level) column index.
///
/// A column is identified by its ordered list of level labels; the flattened
/// form of that list (labels joined with `.`) is what [`fmt::Display`] prints.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct MultiIndexColumn {
    /// Level labels, outermost first (e.g., ["Sales", "Q1", "mean"] for Sales.Q1.mean)
    pub levels: Vec<String>,
    /// Name of each hierarchy level
    pub level_names: Vec<String>,
    /// Data type of the column, stored as free-form text
    pub dtype: String,
}

impl MultiIndexColumn {
    /// Build a column from its level labels, level names and dtype label.
    ///
    /// No validation is performed; the three arguments are stored as given.
    pub fn new(levels: Vec<String>, level_names: Vec<String>, dtype: String) -> Self {
        MultiIndexColumn {
            levels,
            level_names,
            dtype,
        }
    }

    /// Flattened name of the column: all level labels joined with `.`.
    pub fn display_name(&self) -> String {
        let separator = ".";
        self.levels.join(separator)
    }

    /// Innermost (last) level label, or `""` when the column has no levels.
    pub fn leaf_name(&self) -> &str {
        match self.levels.last() {
            Some(leaf) => leaf.as_str(),
            None => "",
        }
    }

    /// Every level label except the last one.
    ///
    /// Returns an empty slice when the column has zero or one level.
    pub fn parent_levels(&self) -> &[String] {
        match self.levels.split_last() {
            Some((_leaf, parents)) => parents,
            None => &[],
        }
    }

    /// Number of levels in this column's hierarchy.
    pub fn depth(&self) -> usize {
        self.levels.len()
    }

    /// Whether `parent_levels` is a strict prefix of this column's levels.
    ///
    /// A prefix as long as (or longer than) the column's own level list never
    /// matches, so a column does not "belong to" itself.
    pub fn belongs_to_parent(&self, parent_levels: &[String]) -> bool {
        self.levels.len() > parent_levels.len() && self.levels.starts_with(parent_levels)
    }

    /// Copy of this column with `level` appended as a new innermost level.
    ///
    /// The appended level is named `level_<i>`, where `<i>` is its zero-based
    /// position in the new level list.
    pub fn with_additional_level(&self, level: String) -> Self {
        let levels: Vec<String> = self
            .levels
            .iter()
            .cloned()
            .chain(std::iter::once(level))
            .collect();

        let appended_name = format!("level_{}", levels.len() - 1);
        let level_names: Vec<String> = self
            .level_names
            .iter()
            .cloned()
            .chain(std::iter::once(appended_name))
            .collect();

        Self::new(levels, level_names, self.dtype.clone())
    }
}

impl fmt::Display for MultiIndexColumn {
    /// Writes the flattened (`.`-joined) column name.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.display_name())
    }
}

/// Multi-index DataFrame for hierarchical results
///
/// Pairs a flat `DataFrame` with a list of [`MultiIndexColumn`] specifications.
/// Methods in this module locate a column's data in the flat frame by the
/// specification's `display_name()` (its levels joined with `.`).
#[derive(Debug, Clone)]
pub struct MultiIndexDataFrame {
    /// Underlying DataFrame with flattened column names
    data: DataFrame,
    /// Multi-index column specifications
    column_index: Vec<MultiIndexColumn>,
    /// Index hierarchy (row multi-index): names of the columns in `data` that act as row index
    index_hierarchy: Vec<String>,
    /// Column hierarchy level names (`level_0`, `level_1`, ... as generated by `new`)
    column_level_names: Vec<String>,
    /// Metadata about the hierarchical structure
    metadata: MultiIndexMetadata,
}

/// Metadata about the multi-index structure
///
/// Populated by `MultiIndexDataFrame::new` from the column index and index
/// hierarchy it is given.
#[derive(Debug, Clone)]
pub struct MultiIndexMetadata {
    /// Maximum column hierarchy depth (0 when there are no columns)
    pub max_column_depth: usize,
    /// Number of index levels
    pub index_levels: usize,
    /// Total number of columns in the column index (index columns not counted)
    pub total_columns: usize,
    /// Aggregation functions used (`MultiIndexDataFrame::new` leaves this empty)
    pub aggregation_functions: Vec<String>,
    /// Original grouping columns (`MultiIndexDataFrame::new` copies the index hierarchy here)
    pub grouping_columns: Vec<String>,
}

impl MultiIndexDataFrame {
    /// Create a new multi-index DataFrame
    ///
    /// Derives the metadata from the arguments: the maximum column depth is the
    /// deepest entry of `column_index` (0 if it is empty), column level names
    /// are generated as `level_0`, `level_1`, ..., the grouping columns are a
    /// copy of `index_hierarchy`, and the aggregation function list starts empty.
    pub fn new(
        data: DataFrame,
        column_index: Vec<MultiIndexColumn>,
        index_hierarchy: Vec<String>,
    ) -> Self {
        let max_column_depth = column_index
            .iter()
            .fold(0usize, |deepest, col| deepest.max(col.depth()));

        let mut column_level_names: Vec<String> = Vec::with_capacity(max_column_depth);
        for level in 0..max_column_depth {
            column_level_names.push(format!("level_{}", level));
        }

        let metadata = MultiIndexMetadata {
            max_column_depth,
            index_levels: index_hierarchy.len(),
            total_columns: column_index.len(),
            aggregation_functions: Vec::new(),
            grouping_columns: index_hierarchy.clone(),
        };

        Self {
            data,
            column_index,
            index_hierarchy,
            column_level_names,
            metadata,
        }
    }

    /// Get the underlying DataFrame
    ///
    /// Returns a borrow of the flat frame; nothing is copied.
    pub fn data(&self) -> &DataFrame {
        &self.data
    }

    /// Get the column index structure
    ///
    /// Returns the multi-index column specifications in their stored order.
    pub fn column_index(&self) -> &[MultiIndexColumn] {
        &self.column_index
    }

    /// Get the index hierarchy
    ///
    /// Returns the names of the row-index columns in their stored order.
    pub fn index_hierarchy(&self) -> &[String] {
        &self.index_hierarchy
    }

    /// Get metadata about the multi-index structure
    ///
    /// The metadata is computed once in `new` and returned here by reference.
    pub fn metadata(&self) -> &MultiIndexMetadata {
        &self.metadata
    }

    /// Select columns by level values
    pub fn select_columns_by_level(
        &self,
        level: usize,
        values: &[String],
    ) -> Result<MultiIndexDataFrame> {
        let selected_columns: Vec<_> = self
            .column_index
            .iter()
            .enumerate()
            .filter(|(_, col)| col.levels.get(level).map_or(false, |v| values.contains(v)))
            .collect();

        if selected_columns.is_empty() {
            return Err(Error::InvalidValue(
                "No columns match the selection criteria".to_string(),
            ));
        }

        // Create new DataFrame with selected columns
        let mut new_data = DataFrame::new();

        // Copy index columns
        for index_col in &self.index_hierarchy {
            if let Ok(series) = self.data.get_column::<String>(index_col) {
                new_data.add_column(index_col.clone(), series.clone())?;
            }
        }

        // Copy selected data columns
        let mut new_column_index = Vec::new();
        for (col_idx, multi_col) in selected_columns {
            let flat_name = multi_col.display_name();
            if let Ok(series) = self.data.get_column::<String>(&flat_name) {
                new_data.add_column(flat_name.clone(), series.clone())?;
                new_column_index.push(multi_col.clone());
            }
        }

        Ok(MultiIndexDataFrame::new(
            new_data,
            new_column_index,
            self.index_hierarchy.clone(),
        ))
    }

    /// Collect the columns nested under the given parent hierarchy.
    ///
    /// A column qualifies when `parent_levels` is a strict prefix of its own
    /// levels (see `MultiIndexColumn::belongs_to_parent`). Matches are returned
    /// in column-index order.
    pub fn get_columns_under_parent(&self, parent_levels: &[String]) -> Vec<&MultiIndexColumn> {
        let mut children = Vec::new();
        for candidate in &self.column_index {
            if candidate.belongs_to_parent(parent_levels) {
                children.push(candidate);
            }
        }
        children
    }

    /// Bucket columns by the first `depth` labels of their hierarchy.
    ///
    /// Columns with `depth` or fewer levels are keyed by their full level list.
    /// Buckets are ordered by key, and each bucket keeps column-index order.
    pub fn group_columns_by_parent(
        &self,
        depth: usize,
    ) -> BTreeMap<Vec<String>, Vec<&MultiIndexColumn>> {
        let mut buckets: BTreeMap<Vec<String>, Vec<&MultiIndexColumn>> = BTreeMap::new();

        for col in &self.column_index {
            // Clamp so shallow columns contribute their whole level list as the key.
            let prefix_len = depth.min(col.levels.len());
            let key = col.levels[..prefix_len].to_vec();
            buckets.entry(key).or_default().push(col);
        }

        buckets
    }

    /// Flatten the multi-index to a regular DataFrame with hierarchical column names
    ///
    /// Returns a clone of the underlying frame as stored; no columns are
    /// renamed or reordered by this call.
    pub fn flatten(&self) -> DataFrame {
        self.data.clone()
    }

    /// Get a hierarchical column summary
    pub fn column_summary(&self) -> ColumnHierarchySummary {
        let mut summary = ColumnHierarchySummary {
            total_columns: self.column_index.len(),
            max_depth: self.metadata.max_column_depth,
            levels_summary: BTreeMap::new(),
        };

        // Group by depth and analyze
        for column in &self.column_index {
            let depth = column.depth();
            let level_summary =
                summary
                    .levels_summary
                    .entry(depth)
                    .or_insert_with(|| LevelSummary {
                        count: 0,
                        unique_parents: std::collections::HashSet::new(),
                        sample_columns: Vec::new(),
                    });

            level_summary.count += 1;

            if column.depth() > 1 {
                level_summary
                    .unique_parents
                    .insert(column.parent_levels().join("."));
            }

            if level_summary.sample_columns.len() < 3 {
                level_summary.sample_columns.push(column.display_name());
            }
        }

        summary
    }

    /// Reshape to a different column hierarchy
    pub fn pivot_columns(&self, new_level_order: &[usize]) -> Result<MultiIndexDataFrame> {
        if new_level_order.len() != self.metadata.max_column_depth {
            return Err(Error::InvalidValue(
                "New level order must match column depth".to_string(),
            ));
        }

        // Reorder column levels
        let mut new_column_index = Vec::new();
        for column in &self.column_index {
            let mut new_levels = Vec::new();
            let mut new_level_names = Vec::new();

            for &level_idx in new_level_order {
                if level_idx < column.levels.len() {
                    new_levels.push(column.levels[level_idx].clone());
                    if level_idx < column.level_names.len() {
                        new_level_names.push(column.level_names[level_idx].clone());
                    }
                }
            }

            new_column_index.push(MultiIndexColumn::new(
                new_levels,
                new_level_names,
                column.dtype.clone(),
            ));
        }

        Ok(MultiIndexDataFrame::new(
            self.data.clone(),
            new_column_index,
            self.index_hierarchy.clone(),
        ))
    }

    /// Convert to a human-readable hierarchical table format
    ///
    /// The output contains, in order: a summary header (shape, index levels,
    /// column depth), the number of columns under each top-level label, one
    /// `Level <i>:` header line per column hierarchy level, and up to
    /// `max_rows` data rows (10 when `max_rows` is `None`). Cells are separated
    /// by ` | `. A cell prints `NULL` when the column has fewer values than the
    /// row index and `ERROR` when the column cannot be read as strings. A
    /// trailing line reports how many rows were not shown.
    pub fn to_hierarchical_string(&self, max_rows: Option<usize>) -> String {
        let total_rows = self.data.row_count();
        let depth = self.metadata.max_column_depth;
        let mut output = String::new();

        // Header information
        output.push_str(&format!(
            "MultiIndex DataFrame ({}x{})\n",
            total_rows,
            self.column_index.len()
        ));
        output.push_str(&format!(
            "Index levels: {}\n",
            self.index_hierarchy.join(" | ")
        ));
        output.push_str(&format!("Column hierarchy depth: {}\n", depth));
        output.push('\n');

        // Column hierarchy display: column count under each top-level label
        output.push_str("Column Hierarchy:\n");
        for (parent, columns) in self.group_columns_by_parent(1) {
            output.push_str(&format!("  {}: {}\n", parent.join("."), columns.len()));
        }
        output.push('\n');

        // Sample data
        let display_rows = max_rows.unwrap_or(10).min(total_rows);
        output.push_str("Sample Data:\n");

        // One header line per hierarchy level: index column names appear on the
        // first line only, and columns shallower than the level get a blank cell.
        for level in 0..depth {
            let mut header_line: Vec<String> =
                Vec::with_capacity(self.index_hierarchy.len() + self.column_index.len());

            for index_col in &self.index_hierarchy {
                header_line.push(if level == 0 {
                    index_col.clone()
                } else {
                    String::new()
                });
            }
            for column in &self.column_index {
                header_line.push(column.levels.get(level).cloned().unwrap_or_default());
            }

            output.push_str(&format!("Level {}: {}\n", level, header_line.join(" | ")));
        }
        output.push('\n');

        if display_rows > 0 {
            // Read every column once up front instead of once per displayed row;
            // `None` marks a column that could not be read.
            let mut column_values =
                Vec::with_capacity(self.index_hierarchy.len() + self.column_index.len());
            for index_col in &self.index_hierarchy {
                column_values.push(self.data.get_column_string_values(index_col).ok());
            }
            for column in &self.column_index {
                let flat_name = column.display_name();
                column_values.push(self.data.get_column_string_values(&flat_name).ok());
            }

            // Print data rows
            for row_idx in 0..display_rows {
                let row_data: Vec<String> = column_values
                    .iter()
                    .map(|values| match values {
                        Some(values) if row_idx < values.len() => values[row_idx].clone(),
                        Some(_) => "NULL".to_string(),
                        None => "ERROR".to_string(),
                    })
                    .collect();

                output.push_str(&format!("Row {}: {}\n", row_idx, row_data.join(" | ")));
            }
        }

        if total_rows > display_rows {
            output.push_str(&format!(
                "... and {} more rows\n",
                total_rows - display_rows
            ));
        }

        output
    }
}

/// Summary of column hierarchy structure
///
/// Produced by `MultiIndexDataFrame::column_summary`.
#[derive(Debug)]
pub struct ColumnHierarchySummary {
    /// Number of entries in the column index
    pub total_columns: usize,
    /// Maximum column hierarchy depth, taken from the frame's metadata
    pub max_depth: usize,
    /// Per-depth statistics, keyed by column depth (number of levels)
    pub levels_summary: BTreeMap<usize, LevelSummary>,
}

/// Summary for a specific level in the hierarchy
///
/// `MultiIndexDataFrame::column_summary` builds one of these per distinct
/// column depth.
#[derive(Debug)]
pub struct LevelSummary {
    /// Number of columns with this depth
    pub count: usize,
    /// Distinct `.`-joined parent paths of the columns at this depth (only filled for depth > 1)
    pub unique_parents: std::collections::HashSet<String>,
    /// Display names of the first columns encountered at this depth (at most three)
    pub sample_columns: Vec<String>,
}

/// Builder for creating multi-index DataFrames from hierarchical aggregations
///
/// Columns are registered with `add_column` / `add_hierarchical_aggregation`
/// and assembled into a `MultiIndexDataFrame` by `build`.
#[derive(Debug)]
pub struct MultiIndexDataFrameBuilder {
    /// Names of the row-index columns of the frame being built
    index_columns: Vec<String>,
    /// Column data keyed by flattened (`.`-joined) column name
    data_columns: HashMap<String, Series<String>>,
    /// Multi-index specifications in the order they were added
    column_hierarchy: Vec<MultiIndexColumn>,
    /// One `<base_column>_<agg_func>` entry per `add_hierarchical_aggregation` call
    aggregation_metadata: Vec<String>,
}

impl MultiIndexDataFrameBuilder {
    /// Create a new builder
    ///
    /// `index_columns` names the row-index columns of the frame to be built;
    /// the builder starts with no data columns and no aggregation metadata.
    pub fn new(index_columns: Vec<String>) -> Self {
        Self {
            index_columns,
            data_columns: HashMap::new(),
            column_hierarchy: Vec::new(),
            aggregation_metadata: Vec::new(),
        }
    }

    /// Add a data column with multi-index specification
    ///
    /// The data is stored under the specification's flattened name
    /// (`display_name()`). Adding a second column with the same flattened name
    /// replaces both the stored data and the stored specification, so the
    /// column hierarchy never lists one flattened name twice.
    ///
    /// # Errors
    ///
    /// Currently always returns `Ok(())`.
    pub fn add_column(
        &mut self,
        column_spec: MultiIndexColumn,
        data: Series<String>,
    ) -> Result<()> {
        let flat_name = column_spec.display_name();
        let replaced_existing = self
            .data_columns
            .insert(flat_name.clone(), data)
            .is_some();

        if replaced_existing {
            // Keep the hierarchy in step with the data map: overwrite the spec
            // that already owns this flattened name instead of appending a
            // duplicate entry that would point at the same data.
            if let Some(existing) = self
                .column_hierarchy
                .iter_mut()
                .find(|spec| spec.display_name() == flat_name)
            {
                *existing = column_spec;
                return Ok(());
            }
        }

        self.column_hierarchy.push(column_spec);
        Ok(())
    }

    /// Register the per-level results of one aggregation on one base column.
    ///
    /// Each `(level, data)` pair becomes a three-level column
    /// `<base_column>.level_<level>.<agg_func>` whose level names are
    /// `column`, `level` and `function` and whose dtype label is `f64`.
    /// Afterwards `<base_column>_<agg_func>` is appended to the aggregation
    /// metadata.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `add_column`.
    pub fn add_hierarchical_aggregation(
        &mut self,
        base_column: &str,
        agg_func: &str,
        level_results: Vec<(usize, Series<String>)>,
    ) -> Result<()> {
        for (level, data) in level_results {
            let levels = vec![
                base_column.to_string(),
                format!("level_{}", level),
                agg_func.to_string(),
            ];
            let level_names: Vec<String> = ["column", "level", "function"]
                .iter()
                .map(|name| name.to_string())
                .collect();

            let spec = MultiIndexColumn::new(levels, level_names, "f64".to_string());
            self.add_column(spec, data)?;
        }

        let metadata_entry = format!("{}_{}", base_column, agg_func);
        self.aggregation_metadata.push(metadata_entry);
        Ok(())
    }

    /// Build the multi-index DataFrame
    pub fn build(self) -> Result<MultiIndexDataFrame> {
        let mut data = DataFrame::new();

        // Determine row count from data columns
        let row_count = if let Some(first_series) = self.data_columns.values().next() {
            first_series.len()
        } else {
            0
        };

        // Add index columns with the correct row count
        for index_col in &self.index_columns {
            // Create placeholder series with correct row count for testing
            let placeholder_data = vec!["placeholder".to_string(); row_count];
            let index_series: Series<String> =
                Series::new(placeholder_data, Some(index_col.clone()))?;
            data.add_column(index_col.clone(), index_series)?;
        }

        // Add data columns
        for (flat_name, series) in self.data_columns {
            data.add_column(flat_name, series)?;
        }

        Ok(MultiIndexDataFrame::new(
            data,
            self.column_hierarchy,
            self.index_columns,
        ))
    }
}

/// Extension trait for converting hierarchical results to multi-index format
pub trait ToMultiIndex {
    /// Convert hierarchical aggregation results to multi-index DataFrame
    ///
    /// `agg_specs` describes the aggregations whose results `self` holds; the
    /// implementation uses it to build the column index of the returned frame.
    fn to_multi_index(&self, agg_specs: &[HierarchicalAgg]) -> Result<MultiIndexDataFrame>;
}

impl ToMultiIndex for DataFrame {
    /// Wrap this frame as a `MultiIndexDataFrame` described by `agg_specs`.
    ///
    /// Every column whose name is not the alias of some level function in
    /// `agg_specs` is treated as an index (grouping) column. For each
    /// `(level, function, alias)` entry of each spec, one three-level column
    /// `<spec.column>.level_<level>.<function>` is added to the column index,
    /// with level names `column`, `level`, `function` and dtype label `f64`.
    /// The frame itself is cloned unchanged.
    fn to_multi_index(&self, agg_specs: &[HierarchicalAgg]) -> Result<MultiIndexDataFrame> {
        let column_names = self.column_names();

        // Identify index columns (grouping columns): everything that is not
        // the output alias of an aggregation.
        let mut index_columns = Vec::new();
        for col_name in &column_names {
            let is_aggregate_output = agg_specs.iter().any(|spec| {
                spec.level_functions
                    .iter()
                    .any(|(_, _, alias)| alias == col_name)
            });
            if !is_aggregate_output {
                index_columns.push(col_name.clone());
            }
        }

        // Create multi-index columns for aggregation results.
        // NOTE(review): the specs' display names are `<column>.level_<n>.<func>`,
        // while the frame's aggregate columns appear to be named by `alias`;
        // lookups by `display_name()` may not find them — confirm against the
        // code that produces these frames.
        let mut column_index = Vec::new();
        for agg_spec in agg_specs {
            for (level, func, _) in &agg_spec.level_functions {
                let column_spec = MultiIndexColumn::new(
                    vec![
                        agg_spec.column.clone(),
                        format!("level_{}", level),
                        func.as_str().to_string(),
                    ],
                    vec![
                        "column".to_string(),
                        "level".to_string(),
                        "function".to_string(),
                    ],
                    "f64".to_string(),
                );

                column_index.push(column_spec);
            }
        }

        Ok(MultiIndexDataFrame::new(
            self.clone(),
            column_index,
            index_columns,
        ))
    }
}

/// Utility functions for multi-index operations
pub mod utils {
    use super::*;

    /// Build a `MultiIndexColumn` from borrowed labels.
    ///
    /// Level names are generated as `level_0`, `level_1`, ... — one per label.
    pub fn simple_multi_index_column(levels: Vec<&str>, dtype: &str) -> MultiIndexColumn {
        let mut owned_levels: Vec<String> = Vec::with_capacity(levels.len());
        let mut level_names: Vec<String> = Vec::with_capacity(levels.len());

        for (position, label) in levels.into_iter().enumerate() {
            level_names.push(format!("level_{}", position));
            owned_levels.push(label.to_string());
        }

        MultiIndexColumn::new(owned_levels, level_names, dtype.to_string())
    }

    /// Build the two-level `<column>.<function>` hierarchy for every pairing.
    ///
    /// Results are ordered column-major: all functions for the first column,
    /// then all functions for the second, and so on. Every entry is given the
    /// dtype label `f64`.
    pub fn create_aggregation_hierarchy(
        columns: &[&str],
        functions: &[&str],
    ) -> Vec<MultiIndexColumn> {
        columns
            .iter()
            .flat_map(|&column| {
                functions
                    .iter()
                    .map(move |&function| simple_multi_index_column(vec![column, function], "f64"))
            })
            .collect()
    }

    /// Merge multiple multi-index DataFrames
    ///
    /// Merging is not implemented yet: the first DataFrame is returned as-is
    /// and any further entries of `dataframes` are dropped.
    ///
    /// # Errors
    ///
    /// Returns `Error::InvalidValue` when `dataframes` is empty.
    pub fn merge_multi_index_dataframes(
        dataframes: Vec<MultiIndexDataFrame>,
    ) -> Result<MultiIndexDataFrame> {
        if dataframes.is_empty() {
            return Err(Error::InvalidValue(
                "Cannot merge empty list of DataFrames".to_string(),
            ));
        }

        // For now, return the first DataFrame (this also covers the
        // single-element case, which needs no special handling).
        // In a full implementation, this would merge the DataFrames properly
        dataframes
            .into_iter()
            .next()
            .ok_or_else(|| Error::InsufficientData("dataframes should not be empty".to_string()))
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for the multi-index column, builder and summary types.
    use super::*;

    /// Checks naming, depth and parent-prefix queries on a three-level column.
    #[test]
    fn test_multi_index_column() {
        let column = MultiIndexColumn::new(
            vec!["Sales".to_string(), "Q1".to_string(), "mean".to_string()],
            vec![
                "metric".to_string(),
                "period".to_string(),
                "function".to_string(),
            ],
            "f64".to_string(),
        );

        assert_eq!(column.display_name(), "Sales.Q1.mean");
        assert_eq!(column.leaf_name(), "mean");
        assert_eq!(column.depth(), 3);
        assert_eq!(column.parent_levels(), &["Sales", "Q1"]);

        // Strict prefixes match; an unrelated top-level label does not.
        assert!(column.belongs_to_parent(&["Sales".to_string()]));
        assert!(column.belongs_to_parent(&["Sales".to_string(), "Q1".to_string()]));
        assert!(!column.belongs_to_parent(&["Revenue".to_string()]));
    }

    /// Builds a frame with one index column and one data column.
    #[test]
    fn test_multi_index_dataframe_builder() {
        let mut builder = MultiIndexDataFrameBuilder::new(vec!["region".to_string()]);

        let column_spec = utils::simple_multi_index_column(vec!["sales", "mean"], "f64");
        let data = Series::new(vec!["100.0".to_string()], Some("sales.mean".to_string()))
            .expect("operation should succeed");

        builder
            .add_column(column_spec, data)
            .expect("operation should succeed");

        let multi_df = builder.build().expect("operation should succeed");
        assert_eq!(multi_df.column_index().len(), 1);
        assert_eq!(multi_df.index_hierarchy(), &["region"]);
    }

    /// Summarises three two-level columns over an empty underlying frame.
    #[test]
    fn test_column_hierarchy_summary() {
        let columns = vec![
            utils::simple_multi_index_column(vec!["sales", "mean"], "f64"),
            utils::simple_multi_index_column(vec!["sales", "sum"], "f64"),
            utils::simple_multi_index_column(vec!["quantity", "count"], "i64"),
        ];

        // The summary only inspects the column index, so no data is needed.
        let data = DataFrame::new();
        let multi_df = MultiIndexDataFrame::new(data, columns, vec!["region".to_string()]);

        let summary = multi_df.column_summary();
        assert_eq!(summary.total_columns, 3);
        assert_eq!(summary.max_depth, 2);
        assert!(summary.levels_summary.contains_key(&2));
    }
}