re_format_arrow/
lib.rs

1//! Formatting for tables of Arrow arrays
2
3use std::fmt::Formatter;
4
5use arrow::{
6    array::{Array, ArrayRef, ListArray},
7    datatypes::{DataType, Field, Fields},
8    util::display::{ArrayFormatter, FormatOptions},
9};
10use comfy_table::{Cell, Row, Table, presets};
11use itertools::{Either, Itertools as _};
12
13use re_arrow_util::{ArrowArrayDowncastRef as _, format_data_type};
14use re_tuid::Tuid;
15use re_types_core::Loggable as _;
16
17// ---
18
19// TODO(#1775): Registering custom formatters should be done from other crates:
20// A) Because `re_format` cannot depend on other crates (cyclic deps)
21// B) Because how to deserialize and inspect some type is a private implementation detail of that
22//    type, re_format shouldn't know how to deserialize a TUID…
23
/// Formats the value at the given row index as a string.
///
/// Returns `Err` with a human-readable message when the value cannot be formatted.
type CustomArrayFormatter<'a> = Box<dyn Fn(usize) -> Result<String, String> + 'a>;

/// Key/value metadata, as attached to a schema or to an individual field.
///
/// This is a `BTreeMap`, and not a `HashMap`, because we want a predictable order.
type Metadata = std::collections::BTreeMap<String, String>;
29
30fn custom_array_formatter<'a>(field: &Field, array: &'a dyn Array) -> CustomArrayFormatter<'a> {
31    if let Some(extension_name) = field.metadata().get("ARROW:extension:name") {
32        // TODO(#1775): This should be registered dynamically.
33        if extension_name.as_str() == Tuid::ARROW_EXTENSION_NAME {
34            // For example: `RowId` is a TUID that should be formatted with a `row_` prefix:
35            let prefix = field
36                .metadata()
37                .get("ARROW:extension:metadata")
38                .and_then(|metadata| serde_json::from_str::<Metadata>(metadata).ok())
39                .and_then(|metadata| {
40                    metadata
41                        .get("namespace")
42                        .map(|namespace| format!("{namespace}_"))
43                })
44                .unwrap_or_default();
45
46            return Box::new(move |index| {
47                if let Some(tuid) = parse_tuid(array, index) {
48                    Ok(format!("{prefix}{tuid}"))
49                } else {
50                    Err("Invalid RowId".to_owned())
51                }
52            });
53        }
54    }
55
56    match ArrayFormatter::try_new(array, &FormatOptions::default().with_null("null")) {
57        Ok(formatter) => Box::new(move |index| Ok(format!("{}", formatter.value(index)))),
58        Err(err) => Box::new(move |_| Err(format!("Failed to format array: {err}"))),
59    }
60}
61
62// TODO(#1775): This should be defined and registered by the `re_tuid` crate.
63fn parse_tuid(array: &dyn Array, index: usize) -> Option<Tuid> {
64    fn parse_inner(array: &dyn Array, index: usize) -> Option<Tuid> {
65        let tuids = Tuid::from_arrow(array).ok()?;
66        tuids.get(index).copied()
67    }
68
69    match array.data_type() {
70        // Legacy MsgId lists: just grab the first value, they're all identical
71        DataType::List(_) => parse_inner(&array.downcast_array_ref::<ListArray>()?.value(index), 0),
72        // New control columns: it's not a list to begin with!
73        _ => parse_inner(array, index),
74    }
75}
76
77// ---
78
79struct DisplayMetadata {
80    prefix: &'static str,
81    metadata: Metadata,
82}
83
84impl std::fmt::Display for DisplayMetadata {
85    #[inline]
86    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
87        let Self { prefix, metadata } = self;
88        f.write_str(
89            &metadata
90                .iter()
91                .map(|(key, value)| format!("{prefix}{}: {}", trim_name(key), trim_name(value)))
92                .collect_vec()
93                .join("\n"),
94        )
95    }
96}
97
/// Strips surrounding whitespace and the well-known `rerun.*` namespace prefixes from `name`.
///
/// The prefixes are tried one after the other, in the order listed below. Each one is removed
/// as many times as it repeats at the start of whatever is left at that point.
fn trim_name(name: &str) -> &str {
    // Order matters: the catch-all `rerun.` must come last.
    const NAMESPACE_PREFIXES: [&str; 10] = [
        "rerun.archetypes.",
        "rerun.components.",
        "rerun.datatypes.",
        "rerun.controls.",
        "rerun.blueprint.archetypes.",
        "rerun.blueprint.components.",
        "rerun.blueprint.datatypes.",
        "rerun.field.",
        "rerun.chunk.",
        "rerun.",
    ];

    NAMESPACE_PREFIXES
        .iter()
        .fold(name.trim(), |rest, prefix| rest.trim_start_matches(*prefix))
}
111
/// Options controlling how a record batch is rendered as a table.
#[derive(Clone, Debug)]
pub struct RecordBatchFormatOpts {
    /// Transpose the dataframe on its diagonal axis when `true`.
    ///
    /// Handy for datasets that are wide (many columns) but short (few rows).
    ///
    /// Transposed output never shows per-column metadata, i.e. it behaves as if
    /// `include_column_metadata` were `false`.
    pub transposed: bool,

    /// Fixed width to render the dataframe with.
    ///
    /// When `None`, the terminal width is used instead.
    pub width: Option<usize>,

    /// Whether to also display the dataframe's own metadata.
    pub include_metadata: bool,

    /// Whether to also display the metadata of each individual column.
    pub include_column_metadata: bool,
}

impl Default for RecordBatchFormatOpts {
    /// Untransposed, terminal-width output, with all metadata shown.
    fn default() -> Self {
        // Both kinds of metadata are displayed unless the caller opts out.
        let show_metadata = true;

        Self {
            transposed: false,
            width: None,
            include_metadata: show_metadata,
            include_column_metadata: show_metadata,
        }
    }
}
143
144/// Nicely format this record batch in a way that fits the terminal.
145pub fn format_record_batch(batch: &arrow::array::RecordBatch) -> Table {
146    format_record_batch_with_width(batch, None)
147}
148
149/// Nicely format this record batch using the specified options.
150pub fn format_record_batch_opts(
151    batch: &arrow::array::RecordBatch,
152    opts: &RecordBatchFormatOpts,
153) -> Table {
154    format_dataframe_with_metadata(
155        &batch.schema_ref().metadata.clone().into_iter().collect(), // HashMap -> BTreeMap
156        &batch.schema_ref().fields,
157        batch.columns(),
158        opts,
159    )
160}
161
162/// Nicely format this record batch, either with the given fixed width, or with the terminal width (`None`).
163///
164/// If `transposed` is `true`, the dataframe will be printed transposed on its diagonal axis.
165/// This is very useful for wide (i.e. lots of columns), short (i.e. not many rows) datasets.
166pub fn format_record_batch_with_width(
167    batch: &arrow::array::RecordBatch,
168    width: Option<usize>,
169) -> Table {
170    format_dataframe_with_metadata(
171        &batch.schema_ref().metadata.clone().into_iter().collect(), // HashMap -> BTreeMap
172        &batch.schema_ref().fields,
173        batch.columns(),
174        &RecordBatchFormatOpts {
175            transposed: false,
176            width,
177            include_metadata: true,
178            include_column_metadata: true,
179        },
180    )
181}
182
183fn format_dataframe_with_metadata(
184    metadata: &Metadata,
185    fields: &Fields,
186    columns: &[ArrayRef],
187    opts: &RecordBatchFormatOpts,
188) -> Table {
189    let &RecordBatchFormatOpts {
190        transposed: _,
191        width,
192        include_metadata,
193        include_column_metadata: _,
194    } = opts;
195
196    let (num_columns, table) = format_dataframe_without_metadata(fields, columns, opts);
197
198    if include_metadata && !metadata.is_empty() {
199        let mut outer_table = Table::new();
200        outer_table.load_preset(presets::UTF8_FULL);
201
202        if let Some(width) = width {
203            outer_table.set_width(width as _);
204            outer_table.set_content_arrangement(comfy_table::ContentArrangement::Disabled);
205        } else {
206            outer_table.set_content_arrangement(comfy_table::ContentArrangement::Dynamic);
207        }
208
209        outer_table.add_row({
210            let mut row = Row::new();
211            row.add_cell(Cell::new(format!(
212                "METADATA:\n{}",
213                DisplayMetadata {
214                    prefix: "* ",
215                    metadata: metadata.clone()
216                }
217            )));
218            row
219        });
220
221        outer_table.add_row(vec![table.trim_fmt()]);
222        outer_table.set_content_arrangement(comfy_table::ContentArrangement::Dynamic);
223        outer_table.set_constraints(
224            std::iter::repeat(comfy_table::ColumnConstraint::ContentWidth).take(num_columns),
225        );
226        outer_table
227    } else {
228        table
229    }
230}
231
232fn format_dataframe_without_metadata(
233    fields: &Fields,
234    columns: &[ArrayRef],
235    opts: &RecordBatchFormatOpts,
236) -> (usize, Table) {
237    let &RecordBatchFormatOpts {
238        transposed,
239        width,
240        include_metadata: _,
241        include_column_metadata,
242    } = opts;
243
244    let mut table = Table::new();
245    table.load_preset(presets::UTF8_FULL);
246
247    if let Some(width) = width {
248        table.set_width(width as _);
249        table.set_content_arrangement(comfy_table::ContentArrangement::Disabled);
250    } else {
251        table.set_content_arrangement(comfy_table::ContentArrangement::Dynamic);
252    }
253
254    let formatters = itertools::izip!(fields.iter(), columns.iter())
255        .map(|(field, array)| custom_array_formatter(field, &**array))
256        .collect_vec();
257
258    let num_columns = if transposed {
259        // Turns:
260        // ```
261        // resource_id     manifest_url
262        // -----------     --------------
263        // resource_1      resource_1_url
264        // resource_2      resource_2_url
265        // resource_3      resource_3_url
266        // resource_4      resource_4_url
267        // ```
268        // into:
269        // ```
270        // resource_id       resource_1         resource_2         resource_3         resource_4
271        // manifest_url      resource_1_url     resource_2_url     resource_3_url     resource_4_url
272        // ```
273
274        let mut headers = fields
275            .iter()
276            .map(|field| Cell::new(trim_name(field.name())))
277            .collect_vec();
278        headers.reverse();
279
280        let mut columns = columns.to_vec();
281        columns.reverse();
282
283        for formatter in formatters {
284            let mut cells = headers.pop().into_iter().collect_vec();
285
286            let Some(col) = columns.pop() else {
287                break;
288            };
289
290            for i in 0..col.len() {
291                let cell = match formatter(i) {
292                    Ok(string) => format_cell(string),
293                    Err(err) => Cell::new(err),
294                };
295                cells.push(cell);
296            }
297
298            table.add_row(cells);
299        }
300
301        columns.first().map_or(0, |list_array| list_array.len())
302    } else {
303        let header = if include_column_metadata {
304            Either::Left(fields.iter().map(|field| {
305                if field.metadata().is_empty() {
306                    Cell::new(format!(
307                        "{}\n---\ntype: {}",
308                        trim_name(field.name()),
309                        format_data_type(field.data_type()),
310                    ))
311                } else {
312                    Cell::new(format!(
313                        "{}\n---\ntype: {}\n{}",
314                        trim_name(field.name()),
315                        format_data_type(field.data_type()),
316                        DisplayMetadata {
317                            prefix: "",
318                            metadata: field.metadata().clone().into_iter().collect()
319                        },
320                    ))
321                }
322            }))
323        } else {
324            Either::Right(
325                fields
326                    .iter()
327                    .map(|field| Cell::new(trim_name(field.name()).to_owned())),
328            )
329        };
330
331        table.set_header(header);
332
333        let num_rows = columns.first().map_or(0, |list_array| list_array.len());
334
335        for row in 0..num_rows {
336            let cells: Vec<_> = formatters
337                .iter()
338                .map(|formatter| match formatter(row) {
339                    Ok(string) => format_cell(string),
340                    Err(err) => Cell::new(err),
341                })
342                .collect();
343            table.add_row(cells);
344        }
345
346        columns.len()
347    };
348
349    table.set_content_arrangement(comfy_table::ContentArrangement::Dynamic);
350    // NOTE: `Percentage` only works for terminals that report their sizes.
351    if table.width().is_some() {
352        let percentage = comfy_table::Width::Percentage((100.0 / num_columns as f32) as u16);
353        table.set_constraints(
354            std::iter::repeat(comfy_table::ColumnConstraint::UpperBoundary(percentage))
355                .take(num_columns),
356        );
357    }
358
359    (num_columns, table)
360}
361
362fn format_cell(string: String) -> Cell {
363    const MAXIMUM_CELL_CONTENT_WIDTH: u16 = 100;
364
365    let chars: Vec<_> = string.chars().collect();
366    if chars.len() > MAXIMUM_CELL_CONTENT_WIDTH as usize {
367        Cell::new(
368            chars
369                .into_iter()
370                .take(MAXIMUM_CELL_CONTENT_WIDTH.saturating_sub(1).into())
371                .chain(['…'])
372                .collect::<String>(),
373        )
374    } else {
375        Cell::new(string)
376    }
377}