parquet 56.2.1

Apache Parquet implementation in Rust
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::arrow::record_reader::buffer::ValuesBuffer;
use arrow_array::{builder::make_view, make_array, ArrayRef};
use arrow_buffer::Buffer;
use arrow_data::ArrayDataBuilder;
use arrow_schema::DataType as ArrowType;

/// A buffer of view type byte arrays that can be converted into
/// `GenericByteViewArray`
///
/// Note this does not reuse `GenericByteViewBuilder` due to the need to call `pad_nulls`
/// and reuse the existing logic for Vec in the parquet crate
#[derive(Debug, Default)]
pub struct ViewBuffer {
    /// The views appended so far, one `u128` per slot of the resulting array.
    pub views: Vec<u128>,
    /// The data blocks registered via `append_block`; views created by
    /// `append_view_unchecked` refer to them by their index in this vector.
    pub buffers: Vec<Buffer>,
}

impl ViewBuffer {
    /// Returns `true` if no views have been appended yet.
    pub fn is_empty(&self) -> bool {
        self.views.is_empty()
    }

    /// Registers `block` as a new data buffer and returns its index.
    ///
    /// The returned index is what [`Self::append_view_unchecked`] expects as its
    /// `block` argument.
    pub fn append_block(&mut self, block: Buffer) -> u32 {
        self.buffers.push(block);
        // The block just pushed sits in the last slot.
        (self.buffers.len() - 1) as u32
    }

    /// Appends a view over `len` bytes starting at `offset` within the buffer
    /// identified by `block`.
    ///
    /// # Safety
    /// This method is only safe when:
    /// - `block` is a valid index, i.e., the return value of `append_block`
    /// - `offset` and `offset + len` are valid indices into the buffer
    /// - The bytes in `offset..offset + len` are a valid value for the native type.
    pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
        let start = offset as usize;
        let stop = offset.saturating_add(len) as usize;

        // SAFETY: the caller guarantees that `block` indexes `self.buffers` and that
        // `start..stop` lies within that buffer.
        let bytes = unsafe {
            self.buffers
                .get_unchecked(block as usize)
                .get_unchecked(start..stop)
        };

        self.views.push(make_view(bytes, block, offset));
    }

    /// Directly append a view to the view array.
    /// This is used when we create a StringViewArray from a dictionary whose values are StringViewArray.
    ///
    /// # Safety
    /// The `view` must be a valid view as per the ByteView spec.
    pub unsafe fn append_raw_view_unchecked(&mut self, view: &u128) {
        let raw = *view;
        self.views.push(raw);
    }

    /// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer`
    ///
    /// The array has one slot per appended view; the view buffer comes first,
    /// followed by every registered data block in order.
    ///
    /// # Panics
    /// Panics if `data_type` is neither `Utf8View` nor `BinaryView`.
    pub fn into_array(self, null_buffer: Option<Buffer>, data_type: &ArrowType) -> ArrayRef {
        let Self { views, buffers } = self;
        let len = views.len();
        let views = Buffer::from_vec(views);

        // Both supported types are assembled the same way; only the type tag differs.
        let view_type = match data_type {
            ArrowType::Utf8View | ArrowType::BinaryView => data_type.clone(),
            _ => panic!("Unsupported data type: {data_type:?}"),
        };

        let builder = ArrayDataBuilder::new(view_type)
            .len(len)
            .add_buffer(views)
            .add_buffers(buffers)
            .null_bit_buffer(null_buffer);

        // SAFETY: validation is skipped on the basis that the views were appended
        // under the contracts of the `append_*_unchecked` methods. For `Utf8View`,
        // the data is expected to have been checked as utf8 when the buffer was built.
        let data = unsafe { builder.build_unchecked() };
        make_array(data)
    }
}

impl ValuesBuffer for ViewBuffer {
    fn pad_nulls(
        &mut self,
        read_offset: usize,
        values_read: usize,
        levels_read: usize,
        valid_mask: &[u8],
    ) {
        self.views
            .pad_nulls(read_offset, values_read, levels_read, valid_mask);
    }
}

#[cfg(test)]
mod tests {

    use arrow_array::Array;

    use super::*;

    /// Builds a buffer holding one data block and three views into it:
    /// `"0"`, `"123456789"` and `"long string to test string view"`.
    fn three_view_buffer() -> ViewBuffer {
        let mut buffer = ViewBuffer::default();
        let block_id =
            buffer.append_block(Buffer::from(b"0123456789long string to test string view"));

        for (offset, len) in [(0, 1), (1, 9), (10, 31)] {
            // SAFETY: `block_id` comes from `append_block`, and every
            // `offset..offset + len` range lies within the 41-byte ASCII block.
            unsafe { buffer.append_view_unchecked(block_id, offset, len) };
        }

        buffer
    }

    /// Downcasts `array` to a `StringViewArray`, panicking if it is another type.
    fn as_string_view(array: &ArrayRef) -> &arrow::array::StringViewArray {
        array
            .as_any()
            .downcast_ref::<arrow::array::StringViewArray>()
            .unwrap()
    }

    #[test]
    fn test_view_buffer_empty() {
        let array = ViewBuffer::default().into_array(None, &ArrowType::Utf8View);
        assert_eq!(as_string_view(&array).len(), 0);
    }

    #[test]
    fn test_view_buffer_append_view() {
        let array = three_view_buffer().into_array(None, &ArrowType::Utf8View);
        let actual: Vec<_> = as_string_view(&array).iter().collect();

        assert_eq!(
            actual,
            vec![
                Some("0"),
                Some("123456789"),
                Some("long string to test string view"),
            ]
        );
    }

    #[test]
    fn test_view_buffer_pad_null() {
        let mut buffer = three_view_buffer();

        let valid = [true, false, false, true, false, false, true];
        let valid_mask = Buffer::from_iter(valid.iter().copied());

        // The first value is already in place; spread the remaining two over
        // the following six levels.
        buffer.pad_nulls(1, 2, valid.len() - 1, valid_mask.as_slice());

        let array = buffer.into_array(Some(valid_mask), &ArrowType::Utf8View);
        let actual: Vec<_> = as_string_view(&array).iter().collect();

        assert_eq!(
            actual,
            vec![
                Some("0"),
                None,
                None,
                Some("123456789"),
                None,
                None,
                Some("long string to test string view"),
            ]
        );
    }
}