use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
use std::sync::Arc;
use arrow2::datatypes::{DataType, Field};
use smallvec::SmallVec;
/// Reports an approximate memory footprint, split into a stack part and a heap part.
pub trait SizeBytes {
    /// Returns the full footprint of `self` in bytes: stack part plus heap part.
    #[inline]
    fn total_size_bytes(&self) -> u64 {
        let on_stack = self.stack_size_bytes();
        let on_heap = self.heap_size_bytes();
        on_stack + on_heap
    }

    /// Returns the number of bytes `self` occupies in place.
    ///
    /// The default is `std::mem::size_of_val(self)`.
    #[inline]
    fn stack_size_bytes(&self) -> u64 {
        std::mem::size_of_val(self) as u64
    }

    /// Returns the number of bytes attributed to `self` beyond its in-place footprint.
    fn heap_size_bytes(&self) -> u64;

    /// Tells whether values of this type are plain-old-data.
    ///
    /// The container implementations in this file use this as a fast path: when it
    /// returns `true` they compute `len * size_of::<T>()` instead of visiting every
    /// element. Defaults to `false`.
    #[inline]
    fn is_pod() -> bool {
        false
    }
}
impl SizeBytes for String {
    /// Counts the bytes of the string's contents; unused capacity is not included.
    #[inline]
    fn heap_size_bytes(&self) -> u64 {
        self.len() as u64
    }
}
impl<K: SizeBytes, V: SizeBytes> SizeBytes for BTreeMap<K, V> {
#[inline]
fn heap_size_bytes(&self) -> u64 {
let keys_size_bytes = if K::is_pod() {
(self.len() * std::mem::size_of::<K>()) as _
} else {
self.keys().map(SizeBytes::total_size_bytes).sum::<u64>()
};
let values_size_bytes = if V::is_pod() {
(self.len() * std::mem::size_of::<V>()) as _
} else {
self.values().map(SizeBytes::total_size_bytes).sum::<u64>()
};
keys_size_bytes + values_size_bytes
}
}
impl<K: SizeBytes> SizeBytes for BTreeSet<K> {
#[inline]
fn heap_size_bytes(&self) -> u64 {
if K::is_pod() {
(self.len() * std::mem::size_of::<K>()) as _
} else {
self.iter().map(SizeBytes::total_size_bytes).sum::<u64>()
}
}
}
impl<K: SizeBytes, V: SizeBytes, S> SizeBytes for HashMap<K, V, S> {
#[inline]
fn heap_size_bytes(&self) -> u64 {
let keys_size_bytes = if K::is_pod() {
(self.len() * std::mem::size_of::<K>()) as _
} else {
self.keys().map(SizeBytes::total_size_bytes).sum::<u64>()
};
let values_size_bytes = if V::is_pod() {
(self.len() * std::mem::size_of::<V>()) as _
} else {
self.values().map(SizeBytes::total_size_bytes).sum::<u64>()
};
keys_size_bytes + values_size_bytes
}
}
impl<T: SizeBytes, const N: usize> SizeBytes for [T; N] {
#[inline]
fn heap_size_bytes(&self) -> u64 {
if T::is_pod() {
0 } else {
self.iter().map(SizeBytes::heap_size_bytes).sum::<u64>()
}
}
}
impl<T: SizeBytes> SizeBytes for Vec<T> {
#[inline]
fn heap_size_bytes(&self) -> u64 {
if T::is_pod() {
(self.len() * std::mem::size_of::<T>()) as _
} else {
self.iter().map(SizeBytes::total_size_bytes).sum::<u64>()
}
}
}
impl<T: SizeBytes> SizeBytes for VecDeque<T> {
#[inline]
fn heap_size_bytes(&self) -> u64 {
if T::is_pod() {
(self.len() * std::mem::size_of::<T>()) as _
} else {
self.iter().map(SizeBytes::total_size_bytes).sum::<u64>()
}
}
}
impl<T: SizeBytes, const N: usize> SizeBytes for SmallVec<[T; N]> {
    /// Sums the footprint of the stored elements; spare capacity is not included.
    ///
    /// While the vector holds at most `N` elements it is treated as stored in place,
    /// so only the elements' own heap data is counted. Past `N` elements the
    /// elements are treated as living on the heap and are counted in full.
    #[inline]
    fn heap_size_bytes(&self) -> u64 {
        let count = self.len();
        let fits_inline = count <= N;

        match (fits_inline, T::is_pod()) {
            // In place, and the elements own nothing on the heap.
            (true, true) => 0,
            // In place, but each element may own heap data of its own.
            (true, false) => self.iter().map(|elem| elem.heap_size_bytes()).sum(),
            // Spilled POD elements: the count alone determines the size.
            (false, true) => (count * std::mem::size_of::<T>()) as u64,
            // Spilled: both the elements and whatever they own are on the heap.
            (false, false) => self.iter().map(|elem| elem.total_size_bytes()).sum(),
        }
    }
}
impl<T: SizeBytes> SizeBytes for Option<T> {
#[inline]
fn heap_size_bytes(&self) -> u64 {
self.as_ref().map_or(0, SizeBytes::heap_size_bytes)
}
}
impl<T: SizeBytes> SizeBytes for Arc<T> {
    /// Always reports zero: the pointee is never attributed to an individual handle.
    ///
    /// NOTE(review): presumably because the allocation is shared between all clones,
    /// so charging it to each handle would count it several times — confirm.
    #[inline]
    fn heap_size_bytes(&self) -> u64 {
        0
    }
}
/// Implements `SizeBytes` for one or more comma-separated plain-old-data types.
///
/// Each generated impl reports zero heap bytes and answers `true` to `is_pod()`.
macro_rules! impl_size_bytes_pod {
    ($($ty:ty),+) => {
        $(
            impl SizeBytes for $ty {
                #[inline]
                fn heap_size_bytes(&self) -> u64 {
                    0
                }

                #[inline]
                fn is_pod() -> bool {
                    true
                }
            }
        )+
    };
}
// Mark the built-in integer, boolean and floating-point types as plain-old-data:
// zero heap bytes, and `is_pod()` returns `true` so containers can take the
// `len * size_of::<T>()` fast path.
impl_size_bytes_pod!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128, bool, f32, f64);
// Same treatment for the half-precision float type that comes from arrow2.
impl_size_bytes_pod!(arrow2::types::f16);
impl<T, U> SizeBytes for (T, U)
where
    T: SizeBytes,
    U: SizeBytes,
{
    /// Adds up the heap footprint of both components.
    #[inline]
    fn heap_size_bytes(&self) -> u64 {
        let first = self.0.heap_size_bytes();
        let second = self.1.heap_size_bytes();
        first + second
    }

    /// A pair is plain-old-data only when both of its components are.
    #[inline]
    fn is_pod() -> bool {
        T::is_pod() && U::is_pod()
    }
}
impl<T, U, V> SizeBytes for (T, U, V)
where
    T: SizeBytes,
    U: SizeBytes,
    V: SizeBytes,
{
    /// Adds up the heap footprint of all three components.
    #[inline]
    fn heap_size_bytes(&self) -> u64 {
        let first = self.0.heap_size_bytes();
        let second = self.1.heap_size_bytes();
        let third = self.2.heap_size_bytes();
        first + second + third
    }

    /// A triple is plain-old-data only when all of its components are.
    #[inline]
    fn is_pod() -> bool {
        T::is_pod() && U::is_pod() && V::is_pod()
    }
}
impl<T, U, V, W> SizeBytes for (T, U, V, W)
where
    T: SizeBytes,
    U: SizeBytes,
    V: SizeBytes,
    W: SizeBytes,
{
    /// Adds up the heap footprint of all four components.
    #[inline]
    fn heap_size_bytes(&self) -> u64 {
        let first = self.0.heap_size_bytes();
        let second = self.1.heap_size_bytes();
        let third = self.2.heap_size_bytes();
        let fourth = self.3.heap_size_bytes();
        first + second + third + fourth
    }

    /// A 4-tuple is plain-old-data only when all of its components are.
    #[inline]
    fn is_pod() -> bool {
        T::is_pod() && U::is_pod() && V::is_pod() && W::is_pod()
    }
}
impl SizeBytes for DataType {
    /// Returns the heap footprint of the data owned by this datatype description.
    ///
    /// Variants without nested data report zero. Nested fields and datatypes are
    /// counted with `total_size_bytes()`, i.e. including their in-place size.
    #[inline]
    fn heap_size_bytes(&self) -> u64 {
        match self {
            // Variants that carry additional owned data.
            Self::Timestamp(_, timezone) => timezone.heap_size_bytes(),

            Self::List(inner)
            | Self::LargeList(inner)
            | Self::FixedSizeList(inner, _)
            | Self::Map(inner, _) => inner.total_size_bytes(),

            Self::Struct(children) => children.heap_size_bytes(),

            Self::Union(children, type_ids, _) => {
                children.heap_size_bytes() + type_ids.heap_size_bytes()
            }

            Self::Dictionary(_, values_type, _) => values_type.total_size_bytes(),

            Self::Extension(name, storage_type, metadata) => {
                name.heap_size_bytes()
                    + storage_type.total_size_bytes()
                    + metadata.heap_size_bytes()
            }

            // Everything else is counted as owning no heap data. The list is kept
            // explicit (no `_` arm) so the match stays exhaustive over the variants.
            Self::Null
            | Self::Boolean
            | Self::Int8
            | Self::Int16
            | Self::Int32
            | Self::Int64
            | Self::UInt8
            | Self::UInt16
            | Self::UInt32
            | Self::UInt64
            | Self::Float16
            | Self::Float32
            | Self::Float64
            | Self::Date32
            | Self::Date64
            | Self::Time32(_)
            | Self::Time64(_)
            | Self::Duration(_)
            | Self::Interval(_)
            | Self::Binary
            | Self::LargeBinary
            | Self::FixedSizeBinary(_)
            | Self::Utf8
            | Self::LargeUtf8
            | Self::Decimal(_, _)
            | Self::Decimal256(_, _) => 0,
        }
    }
}
impl SizeBytes for Field {
    /// Adds up the heap footprint of every member of the field description.
    #[inline]
    fn heap_size_bytes(&self) -> u64 {
        // Destructure without `..` so that a new member added to `Field` has to be
        // handled here explicitly.
        let Self {
            name,
            data_type,
            is_nullable,
            metadata,
        } = self;

        let name_bytes = name.heap_size_bytes();
        let type_bytes = data_type.heap_size_bytes();
        let nullable_bytes = is_nullable.heap_size_bytes();
        let metadata_bytes = metadata.heap_size_bytes();

        name_bytes + type_bytes + nullable_bytes + metadata_bytes
    }
}
impl SizeBytes for dyn Array {
    /// Delegates to `estimated_bytes_size` and widens the result to `u64`.
    fn heap_size_bytes(&self) -> u64 {
        let estimate: usize = estimated_bytes_size(self);
        estimate as u64
    }
}
use arrow2::array::{
Array, BinaryArray, BooleanArray, DictionaryArray, FixedSizeBinaryArray, FixedSizeListArray,
ListArray, MapArray, PrimitiveArray, StructArray, UnionArray, Utf8Array,
};
use arrow2::bitmap::Bitmap;
use arrow2::datatypes::PhysicalType;
// Dispatches on an arrow2 `PrimitiveType` value and expands `$body` once per
// variant, with the caller-chosen type identifier bound to the matching native
// Rust type (e.g. `Int8` => `i8`, `DaysMs` => `days_ms`).
//
// Invocation shape: `with_match_primitive_type!(key, |$T| { ...uses $T... })`.
// The `$` written by the caller is captured as the token `$_`, which lets the
// inner helper macro `__with_ty__` declare a metavariable of its own using the
// caller's identifier.
macro_rules! with_match_primitive_type {(
$key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
use arrow2::datatypes::PrimitiveType::*;
use arrow2::types::{days_ms, months_days_ns, f16, i256};
match $key_type {
Int8 => __with_ty__! { i8 },
Int16 => __with_ty__! { i16 },
Int32 => __with_ty__! { i32 },
Int64 => __with_ty__! { i64 },
Int128 => __with_ty__! { i128 },
Int256 => __with_ty__! { i256 },
DaysMs => __with_ty__! { days_ms },
MonthDayNano => __with_ty__! { months_days_ns },
UInt8 => __with_ty__! { u8 },
UInt16 => __with_ty__! { u16 },
UInt32 => __with_ty__! { u32 },
UInt64 => __with_ty__! { u64 },
Float16 => __with_ty__! { f16 },
Float32 => __with_ty__! { f32 },
Float64 => __with_ty__! { f64 },
}
})}
// Dispatches on an arrow2 `IntegerType` value and expands `$body` once per
// variant, with the caller-chosen type identifier bound to the matching native
// integer type (e.g. `UInt16` => `u16`).
//
// Invocation shape: `match_integer_type!(key, |$T| { ...uses $T... })`.
// The `$` written by the caller is captured as the token `$_`, which lets the
// inner helper macro `__with_ty__` declare a metavariable of its own using the
// caller's identifier.
macro_rules! match_integer_type {(
$key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
use arrow2::datatypes::IntegerType::*;
match $key_type {
Int8 => __with_ty__! { i8 },
Int16 => __with_ty__! { i16 },
Int32 => __with_ty__! { i32 },
Int64 => __with_ty__! { i64 },
UInt8 => __with_ty__! { u8 },
UInt16 => __with_ty__! { u16 },
UInt32 => __with_ty__! { u32 },
UInt64 => __with_ty__! { u64 },
}
})}
/// Estimates the byte size of a variable-length binary/UTF-8 array.
///
/// `$array` is downcast to `$ty`, and `$o` is the offset type of that array. The
/// result is the span of value bytes referenced by the offsets (last offset minus
/// first offset), plus the offsets themselves, plus the validity bitmap.
///
/// Panics if `$array` is not a `$ty`.
macro_rules! dyn_binary {
    ($array:expr, $ty:ty, $o:ty) => {{
        let typed = $array.as_any().downcast_ref::<$ty>().unwrap();
        let offset_buf = typed.offsets().buffer();

        // Measure only the value bytes between the first and the last offset.
        let first = offset_buf[0] as usize;
        let last = offset_buf[offset_buf.len() - 1] as usize;
        let values_bytes = last - first;

        let offsets_bytes = offset_buf.len() * std::mem::size_of::<$o>();

        values_bytes + offsets_bytes + validity_size(typed.validity())
    }};
}
/// Returns the size contribution of an optional validity bitmap, or zero if absent.
///
/// The size is the length of the first element of the tuple returned by
/// `Bitmap::as_slice` (presumably the bitmap's backing bytes — confirm against arrow2).
fn validity_size(validity: Option<&Bitmap>) -> usize {
    match validity {
        Some(bitmap) => bitmap.as_slice().0.len(),
        None => 0,
    }
}
/// Returns an estimate, in bytes, of the buffers referenced by `array`.
///
/// The estimate is computed per physical type:
/// - values, offsets and validity bitmaps are added up;
/// - for offset-based arrays (binary, UTF-8, lists) only the value range spanned
///   by the first and last offset is counted, so sliced arrays are not charged for
///   values they do not reference;
/// - nested arrays (list, struct, union, dictionary, map) recurse into their children.
///
/// # Panics
///
/// Panics if the concrete array type does not match the physical type reported by
/// its datatype (the downcast is unwrapped), or if an offsets buffer is empty.
fn estimated_bytes_size(array: &dyn Array) -> usize {
    // The glob import keeps the `match` arms below readable.
    #[allow(clippy::enum_glob_use)]
    use PhysicalType::*;

    // `List` and `LargeList` share the same layout and differ only in the width of
    // their offsets (`i32` vs `i64`), so each must be downcast to its own
    // `ListArray<O>`; downcasting a `LargeList` to `ListArray<i32>` fails.
    macro_rules! dyn_list {
        ($array:expr, $o:ty) => {{
            let array = $array.as_any().downcast_ref::<ListArray<$o>>().unwrap();
            let offsets = array.offsets().buffer();
            // Only count the child values actually referenced by the offsets.
            let values_start = offsets[0] as usize;
            let values_end = offsets[offsets.len() - 1] as usize;
            estimated_bytes_size(
                array
                    .values()
                    .sliced(values_start, values_end - values_start)
                    .as_ref(),
            ) + std::mem::size_of_val(array.offsets().as_slice())
                + validity_size(array.validity())
        }};
    }

    match array.data_type().to_physical_type() {
        Null => 0,
        Boolean => {
            let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
            array.values().as_slice().0.len() + validity_size(array.validity())
        }
        Primitive(primitive) => with_match_primitive_type!(primitive, |$T| {
            let array = array
                .as_any()
                .downcast_ref::<PrimitiveArray<$T>>()
                .unwrap();
            array.values().len() * std::mem::size_of::<$T>() + validity_size(array.validity())
        }),
        Binary => dyn_binary!(array, BinaryArray<i32>, i32),
        FixedSizeBinary => {
            let array = array
                .as_any()
                .downcast_ref::<FixedSizeBinaryArray>()
                .unwrap();
            array.values().len() + validity_size(array.validity())
        }
        LargeBinary => dyn_binary!(array, BinaryArray<i64>, i64),
        Utf8 => dyn_binary!(array, Utf8Array<i32>, i32),
        LargeUtf8 => dyn_binary!(array, Utf8Array<i64>, i64),
        List => dyn_list!(array, i32),
        FixedSizeList => {
            let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
            estimated_bytes_size(array.values().as_ref()) + validity_size(array.validity())
        }
        LargeList => dyn_list!(array, i64),
        Struct => {
            let array = array.as_any().downcast_ref::<StructArray>().unwrap();
            array
                .values()
                .iter()
                .map(|x| x.as_ref())
                .map(estimated_bytes_size)
                .sum::<usize>()
                + validity_size(array.validity())
        }
        Union => {
            let array = array.as_any().downcast_ref::<UnionArray>().unwrap();
            let types_size = array.types().len() * std::mem::size_of::<i8>();

            if let Some(offsets) = array.offsets() {
                // Dense union: each child only holds the values referenced through
                // `offsets`, so measure the offset range used per type id.

                /// Inclusive range of child offsets seen for one type id.
                #[derive(Debug)]
                struct Range {
                    min: i32,
                    max: i32,
                }

                let mut type_ranges: BTreeMap<i8, Range> = Default::default();

                debug_assert_eq!(array.types().len(), offsets.len());
                for (&type_id, &offset) in array.types().iter().zip(offsets.iter()) {
                    // Track a true min/max rather than relying on the offsets of a
                    // given child being increasing: an out-of-order offset would
                    // otherwise yield `max < min` and a bogus slice length below.
                    type_ranges
                        .entry(type_id)
                        .and_modify(|range| {
                            range.min = range.min.min(offset);
                            range.max = range.max.max(offset);
                        })
                        .or_insert(Range {
                            min: offset,
                            max: offset,
                        });
                }

                let mut fields_size = 0;
                for (type_id, range) in type_ranges {
                    if let Some(field) = array.fields().get(type_id as usize) {
                        // The range is inclusive on both ends.
                        let len = range.max - range.min + 1;
                        fields_size += estimated_bytes_size(
                            field.sliced(range.min as usize, len as usize).as_ref(),
                        );
                    }
                }

                let offsets_size = offsets.len() * std::mem::size_of::<i32>();

                types_size + offsets_size + fields_size
            } else {
                // Sparse union: every child is measured over the union's own length
                // (clamped to the child's length so slicing cannot go out of bounds).
                let num_elems = array.types().len();
                let fields_size = array
                    .fields()
                    .iter()
                    .map(|x| estimated_bytes_size(x.sliced(0, num_elems.min(x.len())).as_ref()))
                    .sum::<usize>();
                types_size + fields_size
            }
        }
        Dictionary(key_type) => match_integer_type!(key_type, |$T| {
            let array = array
                .as_any()
                .downcast_ref::<DictionaryArray<$T>>()
                .unwrap();
            estimated_bytes_size(array.keys()) + estimated_bytes_size(array.values().as_ref())
        }),
        Map => {
            let array = array.as_any().downcast_ref::<MapArray>().unwrap();
            let offsets = array.offsets().len() * std::mem::size_of::<i32>();
            offsets + estimated_bytes_size(array.field().as_ref()) + validity_size(array.validity())
        }
    }
}
// Checks `estimated_bytes_size` against hand-computed sizes for several array
// layouts, and compares them with the size of the equivalent native Rust data.
// The hard-coded native sizes below assume a 64-bit target.
#[test]
#[allow(clippy::from_iter_instead_of_collect)]
fn test_arrow_estimated_size_bytes() {
use arrow2::{
array::{Array, Float64Array, ListArray, StructArray, UInt64Array, Utf8Array},
buffer::Buffer,
datatypes::{DataType, Field, UnionMode},
offset::Offsets,
};
use std::mem::size_of;
// Empty primitive array: nothing to count.
{
let data = vec![];
let array = UInt64Array::from_vec(data.clone()).boxed();
let sz = estimated_bytes_size(&*array);
assert_eq!(0, sz);
assert_eq!(std::mem::size_of_val(data.as_slice()), sz);
}
// Primitive array without validity: same size as the native slice.
{
let data = vec![42u64; 100];
let array = UInt64Array::from_vec(data.clone()).boxed();
assert_eq!(
std::mem::size_of_val(data.as_slice()),
estimated_bytes_size(&*array)
);
}
// UTF-8 array of 100 strings of 40 bytes each.
{
let data = vec![Some("some very, very, very long string indeed"); 100];
let array = Utf8Array::<i32>::from(data.clone()).to_boxed();
let raw_size_bytes = data
.iter()
.map(|s| std::mem::size_of_val(s) + std::mem::size_of_val(s.unwrap().as_bytes()))
.sum::<usize>();
let arrow_size_bytes = estimated_bytes_size(&*array);
// Native: 100 * (16-byte `Option<&str>` + 40 bytes of text).
assert_eq!(5600, raw_size_bytes);
// Arrow: 100 * 40 value bytes + 101 `i32` offsets.
assert_eq!(4404, arrow_size_bytes); }
// List array: 50 lists of 100 `u64` each.
{
let data = std::iter::repeat(vec![42u64; 100])
.take(50)
.collect::<Vec<_>>();
let array = {
let array_flattened =
UInt64Array::from_vec(data.clone().into_iter().flatten().collect()).boxed();
ListArray::<i32>::new(
ListArray::<i32>::default_datatype(DataType::UInt64),
Offsets::try_from_lengths(std::iter::repeat(100).take(50))
.unwrap()
.into(),
array_flattened,
None,
)
.boxed()
};
let raw_size_bytes = data
.iter()
.map(|s| std::mem::size_of_val(s) + std::mem::size_of_val(s.as_slice()))
.sum::<usize>();
let arrow_size_bytes = estimated_bytes_size(&*array);
// Native: 50 * (24-byte `Vec` header + 100 * 8 bytes).
assert_eq!(41200, raw_size_bytes);
// Arrow: 50 * 100 * 8 value bytes + 51 `i32` offsets.
assert_eq!(40204, arrow_size_bytes); }
// Struct array: 100 points stored as two `f64` columns.
{
#[derive(Clone, Copy)]
struct Point {
x: f64,
y: f64,
}
impl Default for Point {
fn default() -> Self {
Self { x: 42.0, y: 666.0 }
}
}
let data = vec![Point::default(); 100];
let array = {
let x = Float64Array::from_vec(data.iter().map(|p| p.x).collect()).boxed();
let y = Float64Array::from_vec(data.iter().map(|p| p.y).collect()).boxed();
let fields = vec![
Field::new("x", DataType::Float64, false),
Field::new("y", DataType::Float64, false),
];
StructArray::new(DataType::Struct(Arc::new(fields)), vec![x, y], None).boxed()
};
let raw_size_bytes = std::mem::size_of_val(data.as_slice());
let arrow_size_bytes = estimated_bytes_size(&*array);
// Both representations hold 100 * 2 * 8 bytes.
assert_eq!(1600, raw_size_bytes);
assert_eq!(1600, arrow_size_bytes);
}
// List of structs: 50 lists of 100 points each.
{
#[derive(Clone, Copy)]
struct Point {
x: f64,
y: f64,
}
impl Default for Point {
fn default() -> Self {
Self { x: 42.0, y: 666.0 }
}
}
let data = std::iter::repeat(vec![Point::default(); 100])
.take(50)
.collect::<Vec<_>>();
let array: Box<dyn Array> = {
let array = {
let x =
Float64Array::from_vec(data.iter().flatten().map(|p| p.x).collect()).boxed();
let y =
Float64Array::from_vec(data.iter().flatten().map(|p| p.y).collect()).boxed();
let fields = vec![
Field::new("x", DataType::Float64, false),
Field::new("y", DataType::Float64, false),
];
StructArray::new(DataType::Struct(Arc::new(fields)), vec![x, y], None)
};
ListArray::<i32>::new(
ListArray::<i32>::default_datatype(array.data_type().clone()),
Offsets::try_from_lengths(std::iter::repeat(100).take(50))
.unwrap()
.into(),
array.boxed(),
None,
)
.boxed()
};
let raw_size_bytes = data
.iter()
.map(|s| std::mem::size_of_val(s) + std::mem::size_of_val(s.as_slice()))
.sum::<usize>();
let arrow_size_bytes = estimated_bytes_size(&*array);
// Native: 50 * (24-byte `Vec` header + 100 * 16 bytes).
assert_eq!(81200, raw_size_bytes);
// Arrow: 50 * 100 * 16 value bytes + 51 `i32` offsets.
assert_eq!(80204, arrow_size_bytes); }
// Dense union of `i32` and `f64` with 5 entries.
{
let fields = vec![
Field::new("i", DataType::Int32, false),
Field::new("f", DataType::Float64, false),
];
let data_type = DataType::Union(
Arc::new(fields),
Some(Arc::new(vec![0i32, 1i32])),
UnionMode::Dense,
);
let types = Buffer::<i8>::from(vec![0i8, 0i8, 1i8, 0i8, 1i8]);
let fields = vec![
PrimitiveArray::<i32>::from_vec(vec![0, 1, 2]).boxed(),
PrimitiveArray::<f64>::from_vec(vec![0.0, 1.0]).boxed(),
];
let offsets = vec![0, 1, 0, 2, 1];
let array = UnionArray::new(data_type, types, fields, Some(offsets.into())).boxed();
// 5 type ids (1 byte each) + 3 `i32` values + 2 `f64` values + 5 `i32` offsets.
let raw_size_bytes = 5 + 3 * size_of::<i32>() + 2 * size_of::<f64>() + 5 * size_of::<i32>();
let arrow_size_bytes = estimated_bytes_size(&*array);
assert_eq!(raw_size_bytes, arrow_size_bytes);
}
}