etop_format/table_formats/
dataframe_format.rs

1use crate::{ColumnFormat, ColumnFormatShorthand, FormatError};
2use polars::prelude::*;
3
/// Render height (in lines) used when a format does not specify `render_height`.
const DEFAULT_TABLE_HEIGHT: usize = 30;
5
6/// dataframe format
7#[derive(Debug)]
8pub struct DataFrameFormat {
9    /// column formats
10    pub column_formats: Option<Vec<ColumnFormatShorthand>>,
11    /// column delimiter
12    pub column_delimiter: String,
13    /// header separator delimiter
14    pub header_separator_delimiter: String,
15    /// header separator char
16    pub header_separator_char: char,
17    /// include header row
18    pub include_header_row: bool,
19    /// include header separator row
20    pub include_header_separator_row: bool,
21    /// include summary row
22    pub include_summary_row: bool,
23    /// include summary separator row
24    pub include_summary_separator_row: bool,
25    /// render height
26    pub render_height: Option<usize>,
27    /// max render width
28    pub max_render_width: Option<usize>,
29}
30
31impl Default for DataFrameFormat {
32    fn default() -> DataFrameFormat {
33        DataFrameFormat {
34            column_formats: None,
35            column_delimiter: "  │  ".to_string(),
36            header_separator_delimiter: "──┼──".to_string(),
37            header_separator_char: '─',
38            include_header_row: true,
39            include_header_separator_row: true,
40            include_summary_row: false,
41            include_summary_separator_row: false,
42            render_height: None,
43            max_render_width: None,
44        }
45    }
46}
47
48/// finalized DataFrameFormat
49#[derive(Debug)]
50pub struct DataFrameFormatFinal {
51    /// column formats
52    pub column_formats: Vec<ColumnFormat>,
53    /// column delimiter
54    pub column_delimiter: String,
55    /// header separator delimiter
56    pub header_separator_delimiter: String,
57    /// header separator char
58    pub header_separator_char: char,
59    /// include header row
60    pub include_header_row: bool,
61    /// include header separator row
62    pub include_header_separator_row: bool,
63    /// include summary row
64    pub include_summary_row: bool,
65    /// include summary separator row
66    pub include_summary_separator_row: bool,
67    /// render height
68    pub render_height: usize,
69    /// max render width
70    pub max_render_width: usize,
71}
72
73impl DataFrameFormat {
74    /// format dataframe as String
75    pub fn format(&self, df: DataFrame) -> Result<String, FormatError> {
76        let fmt = self.finalize(df.clone())?;
77        fmt.format(df)
78    }
79
80    /// fill missing format information based on dataframe
81    fn finalize(&self, df: DataFrame) -> Result<DataFrameFormatFinal, FormatError> {
82        let schema = df.schema();
83        let column_formats: Vec<ColumnFormat> = match &self.column_formats {
84            Some(cols) => {
85                let mut fmts = Vec::new();
86                for col in cols.iter() {
87                    let dtype = match schema.get_field(col.name.as_str()) {
88                        Some(field) => field.dtype,
89                        None => {
90                            return Err(FormatError::ColumnMissing(format!(
91                                "missing column: {}",
92                                col.name
93                            )))
94                        }
95                    };
96                    fmts.push(col.clone().finalize(&dtype)?);
97                }
98                fmts
99            }
100            None => {
101                let fmts: Result<Vec<ColumnFormat>, FormatError> = schema
102                    .iter()
103                    .map(|(name, dtype)| ColumnFormatShorthand::new().name(name).finalize(dtype))
104                    .collect();
105                fmts?
106            }
107        };
108
109        let max_render_width = match self.max_render_width {
110            Some(value) => value,
111            None => {
112                let max_render_width = safe_sum_with_max_on_overflow(
113                    column_formats.iter().map(|c| c.get_max_width()).collect(),
114                );
115                safe_sum_with_max_on_overflow(vec![
116                    max_render_width,
117                    self.column_delimiter.chars().count() * (column_formats.len() - 1),
118                ])
119            }
120        };
121        let fmt = DataFrameFormatFinal {
122            column_formats,
123            column_delimiter: self.column_delimiter.clone(),
124            header_separator_delimiter: self.header_separator_delimiter.clone(),
125            header_separator_char: self.header_separator_char,
126            include_header_row: self.include_header_row,
127            include_header_separator_row: self.include_header_separator_row,
128            include_summary_row: self.include_summary_row,
129            include_summary_separator_row: self.include_summary_separator_row,
130            render_height: self.render_height.unwrap_or(DEFAULT_TABLE_HEIGHT),
131            max_render_width,
132        };
133        Ok(fmt)
134    }
135}
136
/// Sum `numbers`, returning `usize::MAX` if the running total would overflow.
///
/// An empty input sums to 0.
fn safe_sum_with_max_on_overflow(numbers: Vec<usize>) -> usize {
    numbers
        .into_iter()
        // `try_fold` stops at the first addition that does not fit in a `usize`.
        .try_fold(0usize, |total, value| total.checked_add(value))
        .unwrap_or(usize::MAX)
}
147
148// get number of lines in header
149impl DataFrameFormatFinal {
150    fn n_header_lines(&self) -> usize {
151        // TODO: take an n_used_columns parameter, for if only subset of columns used
152        self.column_formats
153            .iter()
154            .map(|f| f.display_name.chars().filter(|&c| c == '\n').count() + 1)
155            .max()
156            .unwrap_or(0)
157    }
158
159    fn n_data_rows(&self) -> usize {
160        self.render_height -
161            (self.include_header_row as usize) *
162                (self.n_header_lines() + (self.include_header_separator_row as usize)) -
163            (self.include_summary_row as usize) *
164                (1 + (self.include_summary_separator_row as usize))
165    }
166
167    fn total_rendered_width(&self, used_widths: &Vec<usize>) -> usize {
168        used_widths.iter().sum::<usize>() +
169            ((used_widths.len() as i64 - 1).max(0) as usize) *
170                self.column_delimiter.chars().count()
171    }
172
173    fn render_header_rows(&self, used_widths: &[usize], total_width: usize) -> Vec<String> {
174        let n_header_lines = self.n_header_lines();
175        let mut rows: Vec<String> =
176            (0..n_header_lines).map(|_| String::with_capacity(total_width)).collect();
177        for (c, width) in used_widths.iter().enumerate() {
178            if c != 0 {
179                for row in rows.iter_mut() {
180                    row.push_str(self.column_delimiter.as_str());
181                }
182            }
183            let name = self.column_formats[c].display_name.as_str();
184            let lines: Vec<String> = name.split('\n').map(|s| s.to_string()).collect();
185            let bound = n_header_lines - lines.len();
186            for row in rows.iter_mut().take(bound) {
187                row.push_str(" ".repeat(*width).as_str());
188            }
189            for (row, line) in rows.iter_mut().skip(bound).zip(lines) {
190                row.push_str(format!("{:>width$}", line, width = width).as_str());
191            }
192        }
193
194        rows
195    }
196
197    fn render_header_separator_row(&self, used_widths: &[usize], total_width: usize) -> String {
198        let mut row = String::with_capacity(total_width);
199        let separator = self.header_separator_char.to_string();
200        for (c, width) in used_widths.iter().enumerate() {
201            if c != 0 {
202                row.push_str(self.header_separator_delimiter.as_str());
203            }
204            row.push_str(separator.repeat(*width).as_str());
205        }
206        row
207    }
208
209    fn render_columns(&self, df: DataFrame) -> Result<(Vec<usize>, Vec<Vec<String>>), FormatError> {
210        // compute global sizes
211        let mut column_min_widths: Vec<usize> = vec![];
212        let mut column_max_widths: Vec<usize> = vec![];
213        for fmt in self.column_formats.iter() {
214            let min_width = fmt.header_width().max(fmt.get_min_width());
215            let max_width = fmt.get_max_width();
216            if min_width > max_width {
217                let msg = format!("min_width > max_width for column: {}", fmt.display_name);
218                return Err(FormatError::InvalidFormat(msg));
219            }
220            column_min_widths.push(min_width);
221            column_max_widths.push(max_width);
222        }
223
224        let total_min_width = column_min_widths.iter().sum::<usize>() +
225            self.column_delimiter.chars().count() * (self.column_formats.len() - 1);
226        // let total_max_width = column_max_widths.iter().sum::<usize>();
227
228        // compute how many columns to include
229        let n_used_columns = if total_min_width >= self.max_render_width {
230            let mut n_used_columns = 0;
231            let mut used_width = 0;
232            for min_width in column_min_widths.iter() {
233                if used_width > 0 {
234                    used_width += self.column_delimiter.chars().count();
235                }
236                if used_width + min_width <= self.max_render_width {
237                    n_used_columns += 1;
238                    used_width += min_width;
239                } else {
240                    break;
241                }
242            }
243            n_used_columns
244        } else {
245            self.column_formats.len()
246        };
247        // let column_min_widths = column_min_widths.into_iter().take(n_used_columns);
248        // let column_max_widths = column_max_widths.into_iter().take(n_used_columns);
249
250        // compute used widths
251        let mut columns = Vec::with_capacity(n_used_columns);
252        let mut used_widths = Vec::with_capacity(n_used_columns);
253        let mut spare_room: usize = self.max_render_width -
254            column_min_widths.iter().take(n_used_columns).sum::<usize>() -
255            self.column_delimiter.chars().count() * ((n_used_columns as i64 - 1).max(0) as usize);
256        // println!("COLUMN_MIN_WIDTHS {:?}", column_min_widths);
257        // println!("TOTAL_MIN_WIDTHS {}",
258        // column_min_widths.iter().take(n_used_columns).sum::<usize>());
259        // println!("SPARE_ROOM {}", spare_room);
260        // println!("MAX_RENDER_WIDTH {:?}", self.max_render_width);
261        // println!(
262        //     "MIN_TOTAL_W_DELIM {:?}",
263        //     column_min_widths.iter().take(n_used_columns).sum::<usize>()
264        //         + self.column_delimiter.chars().count()
265        //             * ((n_used_columns as i64 - 1).max(0) as usize)
266        // );
267        // println!();
268
269        for (c, column_format) in self.column_formats.iter().take(n_used_columns).enumerate() {
270            if let (0, _) = df.shape() {
271                used_widths.push(column_min_widths[c]);
272                columns.push(vec![]);
273                continue
274            }
275
276            let min_width = column_min_widths[c];
277            let max_width = column_max_widths[c].min(min_width + spare_room);
278            let column = column_format
279                .clone()
280                .min_width(min_width)
281                .max_width(max_width)
282                .format(df.column(column_format.name.as_str())?)?;
283            let used_width = column
284                .iter()
285                .map(|s| s.chars().count())
286                // .map(|s| unicode_width::UnicodeWidthStr::width_cjk(s.as_str()))
287                // .map(|s| unicode_width::UnicodeWidthStr::width(s.as_str()))
288                .max()
289                .ok_or(FormatError::EmptyData(format!("empty column: {}", column_format.name)))?;
290            columns.push(column);
291            // println!("NAME {}", column_format.name);
292            // println!("FORMAT {:?}", column_format);
293            // println!("MAX_WIDTH {}", max_width);
294            // println!("MIN_WIDTH {}", min_width);
295            // println!("SPARE_ROOM {}", spare_room);
296            // println!("USED_WIDTH {}", used_width);
297            // println!("NEW_SPARE_ROOM {}", spare_room - (used_width - min_width));
298            // println!();
299            used_widths.push(used_width);
300            spare_room -= used_width - min_width;
301        }
302        Ok((used_widths, columns))
303    }
304
305    fn assemble_rows(&self, columns: Vec<Vec<String>>, rows: &mut Vec<String>, total_width: usize) {
306        let n_data_rows = match columns.first() {
307            Some(column) => column.len(),
308            None => return,
309        };
310        // println!("N_DATA_ROWS: {}", n_data_rows);
311        for r in 0..n_data_rows {
312            let mut row = String::with_capacity(total_width);
313            for (c, column) in columns.iter().enumerate() {
314                if c != 0 {
315                    row.push_str(self.column_delimiter.as_str())
316                }
317                row.push_str(column[r].as_str())
318            }
319            rows.push(row)
320        }
321    }
322
323    pub(crate) fn format(&self, df: DataFrame) -> Result<String, FormatError> {
324        // clip
325        let n_data_rows = self.n_data_rows();
326        let df = df.clone().slice(0, n_data_rows);
327
328        // render columns
329        let (used_widths, columns) = self.render_columns(df)?;
330        let total_width = self.total_rendered_width(&used_widths);
331
332        // assemble rows
333        let mut rows = Vec::with_capacity(self.render_height);
334        if self.include_header_row {
335            for row in self.render_header_rows(&used_widths, total_width) {
336                rows.push(row);
337            }
338            if self.include_header_separator_row {
339                rows.push(self.render_header_separator_row(&used_widths, total_width));
340            }
341        };
342        self.assemble_rows(columns, &mut rows, total_width);
343        if self.include_summary_row {
344            todo!("summary row")
345        }
346
347        Ok(rows.join("\n"))
348    }
349}
350
351// // build header row
352// let n_rows = self.n_rows.unwrap_or_else(|| df.height().min(20));
353// let widths = determine_widths(&df, &columns)?;
354// let total_width = widths.iter().sum();
355// let mut header = String::with_capacity(total_width);
356// let column_delimiter = self.column_delimiter.clone().unwrap_or(" ".to_string());
357// for (i, (column, width)) in columns.iter().zip(widths).enumerate() {
358//     header.push_str(format!("{:>width$}", column.display_name, width = width).as_str());
359//     if i != columns.len() - 1 {
360//         header.push_str(column_delimiter.as_str());
361//     }
362// }
363
364// // convert numeric fields to float64
365// for (name, dtype) in df.schema().iter() {
366//     if dtype.is_numeric() {
367//         df = df
368//             .clone()
369//             .with_column(df.column(name)?.to_float()?)?
370//             .clone();
371//     }
372// }
373
374// // print each row
375// let mut rows = vec![];
376// rows.push(header);
377// for r in 0..n_rows {
378//     let mut row = String::new();
379//     for (c, column_format) in columns.iter().enumerate() {
380//         let df = df.clone();
381//         let column = df.column(column_format.name.as_str())?;
382//         let cell = format_cell(column, column_format, r)?;
383//         row.push_str(cell.as_str());
384//         if c != columns.len() - 1 {
385//             row.push_str(column_delimiter.as_str());
386//         }
387//     }
388//     rows.push(row);
389// }
390
391// Ok(rows.join("\n"))
392
393// fn format_cell(
394//     column: &Series,
395//     column_format: &ColumnFormat,
396//     r: usize,
397// ) -> Result<String, FormatError> {
398//     match column.dtype() {
399//         DataType::Binary => match column.binary()?.get(r) {
400//             Some(binary) => Ok(column_format.binary_format()?.format(binary)?),
401//             None => Ok("-".into()),
402//         },
403//         DataType::Utf8 => Ok(column.str_value(r)?.to_string()),
404//         DataType::Float64 => match column.f64()?.get(r) {
405//             Some(number) => Ok(column_format.number_format()?.format(number)?),
406//             None => Ok("-".into()),
407//         },
408//         DataType::Boolean => match column.bool()?.get(r) {
409//             Some(true) => Ok("yes".to_string()),
410//             Some(false) => Ok("no".to_string()),
411//             None => Ok("-".to_string()),
412//         },
413//         dtype => {
414//             let message = format!("column {} has type {}", column.name(), dtype);
415//             Err(FormatError::UnsupportedDatatype(message))
416//         }
417//     }
418// }
419
420// pub(crate) fn determine_widths(
421//     df: &DataFrame,
422//     columns: &Vec<ColumnFormat>,
423// ) -> Result<Vec<usize>, FormatError> {
424//     let mut widths = Vec::with_capacity(columns.len());
425//     for column in columns.iter() {
426//         match column.min_width {
427//             Some(min_width) => widths.push(min_width),
428//             None => match df.schema().get(column.name.as_str()) {
429//                 Some(dtype) => widths.push(get_dtype_default_width(dtype)),
430//                 None => return Err(FormatError::ColumnMissing(column.name.to_string())),
431//             },
432//         }
433//     }
434//     Ok(widths)
435// }
436
437// pub(crate) fn get_dtype_default_width(dtype: &DataType) -> usize {
438//     match dtype {
439//         DataType::Boolean => 12,
440//         DataType::UInt8 => 12,
441//         DataType::UInt16 => 12,
442//         DataType::UInt32 => 12,
443//         DataType::UInt64 => 12,
444//         DataType::Int8 => 12,
445//         DataType::Int16 => 12,
446//         DataType::Int32 => 12,
447//         DataType::Int64 => 12,
448//         DataType::Float32 => 12,
449//         DataType::Float64 => 12,
450//         // DataType::Decimal(_precision, _scale) => 12,
451//         DataType::Utf8 => 12,
452//         DataType::Binary => 12,
453//         DataType::Date => 12,
454//         DataType::Datetime(_, _) => 12,
455//         DataType::Duration(_unit) => 12,
456//         DataType::Time => 12,
457//         // DataType::Array(_datatype, _size) => 12,
458//         DataType::List(_datatype) => 12,
459//         // DataType::Object(_) => 12,
460//         DataType::Null => 12,
461//         // DataType::Categorical(_) => 12,
462//         DataType::Struct(_fields) => 12,
463//         DataType::Unknown => 12,
464//     }
465// }