etop_format/table_formats/dataframe_format.rs
1use crate::{ColumnFormat, ColumnFormatShorthand, FormatError};
2use polars::prelude::*;
3
/// Total number of rendered lines used when `DataFrameFormat::render_height`
/// is `None`.
const DEFAULT_TABLE_HEIGHT: usize = 30;
5
/// User-facing description of how a dataframe should be rendered as a text
/// table.
///
/// Optional fields are resolved against a concrete dataframe when the table
/// is formatted.
#[derive(Debug)]
pub struct DataFrameFormat {
    /// Per-column formats. When `None`, a format is derived for every column
    /// in the dataframe's schema; when `Some`, every named column must exist
    /// in the dataframe.
    pub column_formats: Option<Vec<ColumnFormatShorthand>>,
    /// String placed between adjacent columns in header rows and data rows.
    pub column_delimiter: String,
    /// String placed between adjacent columns in the header separator row.
    pub header_separator_delimiter: String,
    /// Character repeated to fill each column's width in the header separator
    /// row.
    pub header_separator_char: char,
    /// Whether to render the header row(s).
    pub include_header_row: bool,
    /// Whether to render a separator row under the header. Only takes effect
    /// when `include_header_row` is also `true`.
    pub include_header_separator_row: bool,
    /// Whether to render a summary row. Rendering it is not implemented yet
    /// and currently hits a `todo!`.
    pub include_summary_row: bool,
    /// Whether a separator line is budgeted for the summary row. Only counted
    /// when `include_summary_row` is also `true`.
    pub include_summary_separator_row: bool,
    /// Total number of lines available for the table, including header and
    /// summary lines. Falls back to `DEFAULT_TABLE_HEIGHT` when `None`.
    pub render_height: Option<usize>,
    /// Maximum width of the table, measured in chars. When `None`, it is
    /// computed from the columns' maximum widths plus the delimiters.
    pub max_render_width: Option<usize>,
}
30
31impl Default for DataFrameFormat {
32 fn default() -> DataFrameFormat {
33 DataFrameFormat {
34 column_formats: None,
35 column_delimiter: " │ ".to_string(),
36 header_separator_delimiter: "──┼──".to_string(),
37 header_separator_char: '─',
38 include_header_row: true,
39 include_header_separator_row: true,
40 include_summary_row: false,
41 include_summary_separator_row: false,
42 render_height: None,
43 max_render_width: None,
44 }
45 }
46}
47
/// Fully resolved counterpart of `DataFrameFormat`: every optional setting
/// has a concrete value.
#[derive(Debug)]
pub struct DataFrameFormatFinal {
    /// Resolved format of each column, in rendering order.
    pub column_formats: Vec<ColumnFormat>,
    /// String placed between adjacent columns in header rows and data rows.
    pub column_delimiter: String,
    /// String placed between adjacent columns in the header separator row.
    pub header_separator_delimiter: String,
    /// Character repeated to fill each column's width in the header separator
    /// row.
    pub header_separator_char: char,
    /// Whether to render the header row(s).
    pub include_header_row: bool,
    /// Whether to render a separator row under the header. Only takes effect
    /// when `include_header_row` is also `true`.
    pub include_header_separator_row: bool,
    /// Whether to render a summary row. Rendering it is not implemented yet
    /// and currently hits a `todo!`.
    pub include_summary_row: bool,
    /// Whether a separator line is budgeted for the summary row. Only counted
    /// when `include_summary_row` is also `true`.
    pub include_summary_separator_row: bool,
    /// Total number of lines available for the table, including header and
    /// summary lines.
    pub render_height: usize,
    /// Maximum width of the table, measured in chars.
    pub max_render_width: usize,
}
72
73impl DataFrameFormat {
74 /// format dataframe as String
75 pub fn format(&self, df: DataFrame) -> Result<String, FormatError> {
76 let fmt = self.finalize(df.clone())?;
77 fmt.format(df)
78 }
79
80 /// fill missing format information based on dataframe
81 fn finalize(&self, df: DataFrame) -> Result<DataFrameFormatFinal, FormatError> {
82 let schema = df.schema();
83 let column_formats: Vec<ColumnFormat> = match &self.column_formats {
84 Some(cols) => {
85 let mut fmts = Vec::new();
86 for col in cols.iter() {
87 let dtype = match schema.get_field(col.name.as_str()) {
88 Some(field) => field.dtype,
89 None => {
90 return Err(FormatError::ColumnMissing(format!(
91 "missing column: {}",
92 col.name
93 )))
94 }
95 };
96 fmts.push(col.clone().finalize(&dtype)?);
97 }
98 fmts
99 }
100 None => {
101 let fmts: Result<Vec<ColumnFormat>, FormatError> = schema
102 .iter()
103 .map(|(name, dtype)| ColumnFormatShorthand::new().name(name).finalize(dtype))
104 .collect();
105 fmts?
106 }
107 };
108
109 let max_render_width = match self.max_render_width {
110 Some(value) => value,
111 None => {
112 let max_render_width = safe_sum_with_max_on_overflow(
113 column_formats.iter().map(|c| c.get_max_width()).collect(),
114 );
115 safe_sum_with_max_on_overflow(vec![
116 max_render_width,
117 self.column_delimiter.chars().count() * (column_formats.len() - 1),
118 ])
119 }
120 };
121 let fmt = DataFrameFormatFinal {
122 column_formats,
123 column_delimiter: self.column_delimiter.clone(),
124 header_separator_delimiter: self.header_separator_delimiter.clone(),
125 header_separator_char: self.header_separator_char,
126 include_header_row: self.include_header_row,
127 include_header_separator_row: self.include_header_separator_row,
128 include_summary_row: self.include_summary_row,
129 include_summary_separator_row: self.include_summary_separator_row,
130 render_height: self.render_height.unwrap_or(DEFAULT_TABLE_HEIGHT),
131 max_render_width,
132 };
133 Ok(fmt)
134 }
135}
136
/// Sum `numbers`, returning `usize::MAX` instead of overflowing.
///
/// An empty input sums to `0`.
fn safe_sum_with_max_on_overflow(numbers: Vec<usize>) -> usize {
    numbers
        .into_iter()
        // `try_fold` stops at the first addition that would overflow.
        .try_fold(0usize, |total, value| total.checked_add(value))
        .unwrap_or(usize::MAX)
}
147
148// get number of lines in header
149impl DataFrameFormatFinal {
150 fn n_header_lines(&self) -> usize {
151 // TODO: take an n_used_columns parameter, for if only subset of columns used
152 self.column_formats
153 .iter()
154 .map(|f| f.display_name.chars().filter(|&c| c == '\n').count() + 1)
155 .max()
156 .unwrap_or(0)
157 }
158
159 fn n_data_rows(&self) -> usize {
160 self.render_height -
161 (self.include_header_row as usize) *
162 (self.n_header_lines() + (self.include_header_separator_row as usize)) -
163 (self.include_summary_row as usize) *
164 (1 + (self.include_summary_separator_row as usize))
165 }
166
167 fn total_rendered_width(&self, used_widths: &Vec<usize>) -> usize {
168 used_widths.iter().sum::<usize>() +
169 ((used_widths.len() as i64 - 1).max(0) as usize) *
170 self.column_delimiter.chars().count()
171 }
172
173 fn render_header_rows(&self, used_widths: &[usize], total_width: usize) -> Vec<String> {
174 let n_header_lines = self.n_header_lines();
175 let mut rows: Vec<String> =
176 (0..n_header_lines).map(|_| String::with_capacity(total_width)).collect();
177 for (c, width) in used_widths.iter().enumerate() {
178 if c != 0 {
179 for row in rows.iter_mut() {
180 row.push_str(self.column_delimiter.as_str());
181 }
182 }
183 let name = self.column_formats[c].display_name.as_str();
184 let lines: Vec<String> = name.split('\n').map(|s| s.to_string()).collect();
185 let bound = n_header_lines - lines.len();
186 for row in rows.iter_mut().take(bound) {
187 row.push_str(" ".repeat(*width).as_str());
188 }
189 for (row, line) in rows.iter_mut().skip(bound).zip(lines) {
190 row.push_str(format!("{:>width$}", line, width = width).as_str());
191 }
192 }
193
194 rows
195 }
196
197 fn render_header_separator_row(&self, used_widths: &[usize], total_width: usize) -> String {
198 let mut row = String::with_capacity(total_width);
199 let separator = self.header_separator_char.to_string();
200 for (c, width) in used_widths.iter().enumerate() {
201 if c != 0 {
202 row.push_str(self.header_separator_delimiter.as_str());
203 }
204 row.push_str(separator.repeat(*width).as_str());
205 }
206 row
207 }
208
209 fn render_columns(&self, df: DataFrame) -> Result<(Vec<usize>, Vec<Vec<String>>), FormatError> {
210 // compute global sizes
211 let mut column_min_widths: Vec<usize> = vec![];
212 let mut column_max_widths: Vec<usize> = vec![];
213 for fmt in self.column_formats.iter() {
214 let min_width = fmt.header_width().max(fmt.get_min_width());
215 let max_width = fmt.get_max_width();
216 if min_width > max_width {
217 let msg = format!("min_width > max_width for column: {}", fmt.display_name);
218 return Err(FormatError::InvalidFormat(msg));
219 }
220 column_min_widths.push(min_width);
221 column_max_widths.push(max_width);
222 }
223
224 let total_min_width = column_min_widths.iter().sum::<usize>() +
225 self.column_delimiter.chars().count() * (self.column_formats.len() - 1);
226 // let total_max_width = column_max_widths.iter().sum::<usize>();
227
228 // compute how many columns to include
229 let n_used_columns = if total_min_width >= self.max_render_width {
230 let mut n_used_columns = 0;
231 let mut used_width = 0;
232 for min_width in column_min_widths.iter() {
233 if used_width > 0 {
234 used_width += self.column_delimiter.chars().count();
235 }
236 if used_width + min_width <= self.max_render_width {
237 n_used_columns += 1;
238 used_width += min_width;
239 } else {
240 break;
241 }
242 }
243 n_used_columns
244 } else {
245 self.column_formats.len()
246 };
247 // let column_min_widths = column_min_widths.into_iter().take(n_used_columns);
248 // let column_max_widths = column_max_widths.into_iter().take(n_used_columns);
249
250 // compute used widths
251 let mut columns = Vec::with_capacity(n_used_columns);
252 let mut used_widths = Vec::with_capacity(n_used_columns);
253 let mut spare_room: usize = self.max_render_width -
254 column_min_widths.iter().take(n_used_columns).sum::<usize>() -
255 self.column_delimiter.chars().count() * ((n_used_columns as i64 - 1).max(0) as usize);
256 // println!("COLUMN_MIN_WIDTHS {:?}", column_min_widths);
257 // println!("TOTAL_MIN_WIDTHS {}",
258 // column_min_widths.iter().take(n_used_columns).sum::<usize>());
259 // println!("SPARE_ROOM {}", spare_room);
260 // println!("MAX_RENDER_WIDTH {:?}", self.max_render_width);
261 // println!(
262 // "MIN_TOTAL_W_DELIM {:?}",
263 // column_min_widths.iter().take(n_used_columns).sum::<usize>()
264 // + self.column_delimiter.chars().count()
265 // * ((n_used_columns as i64 - 1).max(0) as usize)
266 // );
267 // println!();
268
269 for (c, column_format) in self.column_formats.iter().take(n_used_columns).enumerate() {
270 if let (0, _) = df.shape() {
271 used_widths.push(column_min_widths[c]);
272 columns.push(vec![]);
273 continue
274 }
275
276 let min_width = column_min_widths[c];
277 let max_width = column_max_widths[c].min(min_width + spare_room);
278 let column = column_format
279 .clone()
280 .min_width(min_width)
281 .max_width(max_width)
282 .format(df.column(column_format.name.as_str())?)?;
283 let used_width = column
284 .iter()
285 .map(|s| s.chars().count())
286 // .map(|s| unicode_width::UnicodeWidthStr::width_cjk(s.as_str()))
287 // .map(|s| unicode_width::UnicodeWidthStr::width(s.as_str()))
288 .max()
289 .ok_or(FormatError::EmptyData(format!("empty column: {}", column_format.name)))?;
290 columns.push(column);
291 // println!("NAME {}", column_format.name);
292 // println!("FORMAT {:?}", column_format);
293 // println!("MAX_WIDTH {}", max_width);
294 // println!("MIN_WIDTH {}", min_width);
295 // println!("SPARE_ROOM {}", spare_room);
296 // println!("USED_WIDTH {}", used_width);
297 // println!("NEW_SPARE_ROOM {}", spare_room - (used_width - min_width));
298 // println!();
299 used_widths.push(used_width);
300 spare_room -= used_width - min_width;
301 }
302 Ok((used_widths, columns))
303 }
304
305 fn assemble_rows(&self, columns: Vec<Vec<String>>, rows: &mut Vec<String>, total_width: usize) {
306 let n_data_rows = match columns.first() {
307 Some(column) => column.len(),
308 None => return,
309 };
310 // println!("N_DATA_ROWS: {}", n_data_rows);
311 for r in 0..n_data_rows {
312 let mut row = String::with_capacity(total_width);
313 for (c, column) in columns.iter().enumerate() {
314 if c != 0 {
315 row.push_str(self.column_delimiter.as_str())
316 }
317 row.push_str(column[r].as_str())
318 }
319 rows.push(row)
320 }
321 }
322
323 pub(crate) fn format(&self, df: DataFrame) -> Result<String, FormatError> {
324 // clip
325 let n_data_rows = self.n_data_rows();
326 let df = df.clone().slice(0, n_data_rows);
327
328 // render columns
329 let (used_widths, columns) = self.render_columns(df)?;
330 let total_width = self.total_rendered_width(&used_widths);
331
332 // assemble rows
333 let mut rows = Vec::with_capacity(self.render_height);
334 if self.include_header_row {
335 for row in self.render_header_rows(&used_widths, total_width) {
336 rows.push(row);
337 }
338 if self.include_header_separator_row {
339 rows.push(self.render_header_separator_row(&used_widths, total_width));
340 }
341 };
342 self.assemble_rows(columns, &mut rows, total_width);
343 if self.include_summary_row {
344 todo!("summary row")
345 }
346
347 Ok(rows.join("\n"))
348 }
349}
350
351// // build header row
352// let n_rows = self.n_rows.unwrap_or_else(|| df.height().min(20));
353// let widths = determine_widths(&df, &columns)?;
354// let total_width = widths.iter().sum();
355// let mut header = String::with_capacity(total_width);
356// let column_delimiter = self.column_delimiter.clone().unwrap_or(" ".to_string());
357// for (i, (column, width)) in columns.iter().zip(widths).enumerate() {
358// header.push_str(format!("{:>width$}", column.display_name, width = width).as_str());
359// if i != columns.len() - 1 {
360// header.push_str(column_delimiter.as_str());
361// }
362// }
363
364// // convert numeric fields to float64
365// for (name, dtype) in df.schema().iter() {
366// if dtype.is_numeric() {
367// df = df
368// .clone()
369// .with_column(df.column(name)?.to_float()?)?
370// .clone();
371// }
372// }
373
374// // print each row
375// let mut rows = vec![];
376// rows.push(header);
377// for r in 0..n_rows {
378// let mut row = String::new();
379// for (c, column_format) in columns.iter().enumerate() {
380// let df = df.clone();
381// let column = df.column(column_format.name.as_str())?;
382// let cell = format_cell(column, column_format, r)?;
383// row.push_str(cell.as_str());
384// if c != columns.len() - 1 {
385// row.push_str(column_delimiter.as_str());
386// }
387// }
388// rows.push(row);
389// }
390
391// Ok(rows.join("\n"))
392
393// fn format_cell(
394// column: &Series,
395// column_format: &ColumnFormat,
396// r: usize,
397// ) -> Result<String, FormatError> {
398// match column.dtype() {
399// DataType::Binary => match column.binary()?.get(r) {
400// Some(binary) => Ok(column_format.binary_format()?.format(binary)?),
401// None => Ok("-".into()),
402// },
403// DataType::Utf8 => Ok(column.str_value(r)?.to_string()),
404// DataType::Float64 => match column.f64()?.get(r) {
405// Some(number) => Ok(column_format.number_format()?.format(number)?),
406// None => Ok("-".into()),
407// },
408// DataType::Boolean => match column.bool()?.get(r) {
409// Some(true) => Ok("yes".to_string()),
410// Some(false) => Ok("no".to_string()),
411// None => Ok("-".to_string()),
412// },
413// dtype => {
414// let message = format!("column {} has type {}", column.name(), dtype);
415// Err(FormatError::UnsupportedDatatype(message))
416// }
417// }
418// }
419
420// pub(crate) fn determine_widths(
421// df: &DataFrame,
422// columns: &Vec<ColumnFormat>,
423// ) -> Result<Vec<usize>, FormatError> {
424// let mut widths = Vec::with_capacity(columns.len());
425// for column in columns.iter() {
426// match column.min_width {
427// Some(min_width) => widths.push(min_width),
428// None => match df.schema().get(column.name.as_str()) {
429// Some(dtype) => widths.push(get_dtype_default_width(dtype)),
430// None => return Err(FormatError::ColumnMissing(column.name.to_string())),
431// },
432// }
433// }
434// Ok(widths)
435// }
436
437// pub(crate) fn get_dtype_default_width(dtype: &DataType) -> usize {
438// match dtype {
439// DataType::Boolean => 12,
440// DataType::UInt8 => 12,
441// DataType::UInt16 => 12,
442// DataType::UInt32 => 12,
443// DataType::UInt64 => 12,
444// DataType::Int8 => 12,
445// DataType::Int16 => 12,
446// DataType::Int32 => 12,
447// DataType::Int64 => 12,
448// DataType::Float32 => 12,
449// DataType::Float64 => 12,
450// // DataType::Decimal(_precision, _scale) => 12,
451// DataType::Utf8 => 12,
452// DataType::Binary => 12,
453// DataType::Date => 12,
454// DataType::Datetime(_, _) => 12,
455// DataType::Duration(_unit) => 12,
456// DataType::Time => 12,
457// // DataType::Array(_datatype, _size) => 12,
458// DataType::List(_datatype) => 12,
459// // DataType::Object(_) => 12,
460// DataType::Null => 12,
461// // DataType::Categorical(_) => 12,
462// DataType::Struct(_fields) => 12,
463// DataType::Unknown => 12,
464// }
465// }