1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
use parquet2::{
encoding::{hybrid_rle::bitpacked_encode, Encoding},
metadata::ColumnDescriptor,
page::DataPage,
statistics::{serialize_statistics, BooleanStatistics, ParquetStatistics, Statistics},
write::WriteOptions,
};
use super::super::utils;
use crate::error::Result;
use crate::{array::*, io::parquet::read::is_type_nullable};
/// Runs `bitpacked_encode` over `iterator`, writing its output after the bytes
/// that are already in `buffer`.
///
/// Existing content of `buffer` is left in place: the writer is positioned at
/// the current end of the vector before encoding starts.
///
/// # Errors
/// Propagates (converted via `?`) any error returned by `bitpacked_encode`.
fn encode(iterator: impl Iterator<Item = bool>, buffer: &mut Vec<u8>) -> Result<()> {
    // Remember where the existing bytes end so that we append instead of overwrite.
    let start = buffer.len() as u64;
    let mut writer = std::io::Cursor::new(buffer);
    writer.set_position(start);
    bitpacked_encode(&mut writer, iterator)?;
    Ok(())
}
/// Encodes the boolean values of `array` and appends them to `buffer`.
///
/// * When `is_optional` is `false`, every slot of `array.values()` is encoded.
/// * When `is_optional` is `true`, only the non-null entries of `array` are
///   encoded (null slots are dropped by `flatten`).
///
/// # Errors
/// Returns whatever error `encode` reports.
pub(super) fn encode_plain(
    array: &BooleanArray,
    is_optional: bool,
    buffer: &mut Vec<u8>,
) -> Result<()> {
    if !is_optional {
        // Required column: all value slots are written as they are.
        return encode(array.values().iter(), buffer);
    }

    // Number of non-null slots; without a validity bitmap every slot is valid.
    let valid_count = match array.validity().as_ref() {
        Some(bitmap) => bitmap.len() - bitmap.null_count(),
        None => array.len(),
    };

    // `take` caps the flattened iterator at the number of valid slots.
    // NOTE(review): presumably needed so the encoder sees a bounded upper
    // size hint — confirm against `bitpacked_encode` before removing it.
    let non_null_values = array.iter().flatten().take(valid_count);
    encode(non_null_values, buffer)
}
/// Builds a plain-encoded [`DataPage`] from a [`BooleanArray`].
///
/// The page buffer is laid out as: definition levels (written by
/// `utils::write_def_levels`) followed by the encoded values (written by
/// `encode_plain`). Statistics are attached only when
/// `options.write_statistics` is set.
///
/// # Errors
/// Returns an error if writing the definition levels, encoding the values, or
/// assembling the page fails.
pub fn array_to_page(
    array: &BooleanArray,
    options: WriteOptions,
    descriptor: ColumnDescriptor,
) -> Result<DataPage> {
    let is_optional = is_type_nullable(descriptor.type_());

    let mut page_bytes = vec![];
    utils::write_def_levels(
        &mut page_bytes,
        is_optional,
        array.validity(),
        array.len(),
        options.version,
    )?;
    // Everything written so far is definition levels; record the size before
    // the values are appended to the same buffer.
    let def_levels_len = page_bytes.len();

    encode_plain(array, is_optional, &mut page_bytes)?;

    let statistics = options.write_statistics.then(|| build_statistics(array));

    utils::build_plain_page(
        page_bytes,
        array.len(),
        array.null_count(),
        // NOTE(review): presumably the repetition-levels byte length (none are
        // written here) — confirm against `utils::build_plain_page`.
        0,
        def_levels_len,
        statistics,
        descriptor,
        options,
        Encoding::Plain,
    )
}
/// Computes the statistics of `array` and serializes them into
/// [`ParquetStatistics`].
///
/// The minimum and maximum are taken over the non-null entries only and are
/// `None` when there are no such entries. The distinct count is not computed.
pub(super) fn build_statistics(array: &BooleanArray) -> ParquetStatistics {
    // `flatten` skips null slots, so only present values take part.
    let max_value = array.iter().flatten().max();
    let min_value = array.iter().flatten().min();
    let null_count = Some(array.null_count() as i64);

    let stats = BooleanStatistics {
        null_count,
        distinct_count: None,
        max_value,
        min_value,
    };
    serialize_statistics(&stats as &dyn Statistics)
}