parquet2/metadata/
row_metadata.rs

1use parquet_format_safe::RowGroup;
2
3use super::{column_chunk_metadata::ColumnChunkMetaData, schema_descriptor::SchemaDescriptor};
4use crate::{
5    error::{Error, Result},
6    write::ColumnOffsetsMetadata,
7};
8#[cfg(feature = "serde_types")]
9use serde::{Deserialize, Serialize};
10
/// Metadata for a row group.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub struct RowGroupMetaData {
    // Per-column-chunk metadata; parallel to the schema's columns
    // (see `try_from_thrift`, which zips against `schema_descr.columns()`).
    columns: Vec<ColumnChunkMetaData>,
    // Number of rows in this row group.
    num_rows: usize,
    // Total byte size of all uncompressed column data in this row group.
    total_byte_size: usize,
}
19
20impl RowGroupMetaData {
21    /// Create a new [`RowGroupMetaData`]
22    pub fn new(
23        columns: Vec<ColumnChunkMetaData>,
24        num_rows: usize,
25        total_byte_size: usize,
26    ) -> RowGroupMetaData {
27        Self {
28            columns,
29            num_rows,
30            total_byte_size,
31        }
32    }
33
34    /// Returns slice of column chunk metadata.
35    pub fn columns(&self) -> &[ColumnChunkMetaData] {
36        &self.columns
37    }
38
39    /// Number of rows in this row group.
40    pub fn num_rows(&self) -> usize {
41        self.num_rows
42    }
43
44    /// Total byte size of all uncompressed column data in this row group.
45    pub fn total_byte_size(&self) -> usize {
46        self.total_byte_size
47    }
48
49    /// Total size of all compressed column data in this row group.
50    pub fn compressed_size(&self) -> usize {
51        self.columns
52            .iter()
53            .map(|c| c.compressed_size() as usize)
54            .sum::<usize>()
55    }
56
57    /// Method to convert from Thrift.
58    pub(crate) fn try_from_thrift(
59        schema_descr: &SchemaDescriptor,
60        rg: RowGroup,
61    ) -> Result<RowGroupMetaData> {
62        if schema_descr.columns().len() != rg.columns.len() {
63            return Err(Error::oos(format!("The number of columns in the row group ({}) must be equal to the number of columns in the schema ({})", rg.columns.len(), schema_descr.columns().len())));
64        }
65        let total_byte_size = rg.total_byte_size.try_into()?;
66        let num_rows = rg.num_rows.try_into()?;
67        let columns = rg
68            .columns
69            .into_iter()
70            .zip(schema_descr.columns())
71            .map(|(column_chunk, descriptor)| {
72                ColumnChunkMetaData::try_from_thrift(descriptor.clone(), column_chunk)
73            })
74            .collect::<Result<Vec<_>>>()?;
75
76        Ok(RowGroupMetaData {
77            columns,
78            num_rows,
79            total_byte_size,
80        })
81    }
82
83    /// Method to convert to Thrift.
84    pub(crate) fn into_thrift(self) -> RowGroup {
85        let file_offset = self
86            .columns
87            .iter()
88            .map(|c| {
89                ColumnOffsetsMetadata::from_column_chunk_metadata(c).calc_row_group_file_offset()
90            })
91            .next()
92            .unwrap_or(None);
93        let total_compressed_size = Some(self.compressed_size() as i64);
94        RowGroup {
95            columns: self.columns.into_iter().map(|v| v.into_thrift()).collect(),
96            total_byte_size: self.total_byte_size as i64,
97            num_rows: self.num_rows as i64,
98            sorting_columns: None,
99            file_offset,
100            total_compressed_size,
101            ordinal: None,
102        }
103    }
104}