lance_arrow/
deepcopy.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use arrow_array::{make_array, Array, RecordBatch};
7use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer};
8use arrow_data::{transform::MutableArrayData, ArrayData, ArrayDataBuilder};
9
10pub fn deep_copy_buffer(buffer: &Buffer) -> Buffer {
11    Buffer::from(buffer.as_slice())
12}
13
14pub fn deep_copy_nulls(nulls: Option<&NullBuffer>) -> Option<NullBuffer> {
15    let nulls = nulls?;
16    let bit_buffer = deep_copy_buffer(nulls.inner().inner());
17    Some(unsafe {
18        NullBuffer::new_unchecked(
19            BooleanBuffer::new(bit_buffer, nulls.offset(), nulls.len()),
20            nulls.null_count(),
21        )
22    })
23}
24
25pub fn deep_copy_array_data(data: &ArrayData) -> ArrayData {
26    let data_type = data.data_type().clone();
27    let len = data.len();
28    let nulls = deep_copy_nulls(data.nulls());
29    let offset = data.offset();
30    let buffers = data
31        .buffers()
32        .iter()
33        .map(deep_copy_buffer)
34        .collect::<Vec<_>>();
35    let child_data = data
36        .child_data()
37        .iter()
38        .map(deep_copy_array_data)
39        .collect::<Vec<_>>();
40    unsafe {
41        ArrayDataBuilder::new(data_type)
42            .len(len)
43            .nulls(nulls)
44            .offset(offset)
45            .buffers(buffers)
46            .child_data(child_data)
47            .build_unchecked()
48    }
49}
50
51pub fn deep_copy_array(array: &dyn Array) -> Arc<dyn Array> {
52    let data = array.to_data();
53    let data = deep_copy_array_data(&data);
54    make_array(data)
55}
56
57pub fn deep_copy_batch(batch: &RecordBatch) -> crate::Result<RecordBatch> {
58    let arrays = batch
59        .columns()
60        .iter()
61        .map(|array| deep_copy_array(array))
62        .collect::<Vec<_>>();
63    RecordBatch::try_new(batch.schema(), arrays)
64}
65
66/// Deep copy array data, extracting only the sliced portion using MutableArrayData
67/// This is the most efficient and correct way to copy just the sliced data
68pub fn deep_copy_array_data_sliced(data: &ArrayData) -> ArrayData {
69    // Use MutableArrayData to efficiently copy just the slice
70    let mut mutable = MutableArrayData::new(vec![data], false, data.len());
71
72    // Copy from offset to offset+len (the visible slice)
73    mutable.extend(0, data.offset(), data.offset() + data.len());
74
75    // Freeze into immutable ArrayData
76    mutable.freeze()
77}
78
79/// Deep copy an array, extracting only the sliced portion using MutableArrayData
80pub fn deep_copy_array_sliced(array: &dyn Array) -> Arc<dyn Array> {
81    let data = array.to_data();
82    let data = deep_copy_array_data_sliced(&data);
83    make_array(data)
84}
85
86/// Deep copy a RecordBatch, extracting only the sliced portion using MutableArrayData
87pub fn deep_copy_batch_sliced(batch: &RecordBatch) -> crate::Result<RecordBatch> {
88    let arrays = batch
89        .columns()
90        .iter()
91        .map(|array| deep_copy_array_sliced(array))
92        .collect::<Vec<_>>();
93    RecordBatch::try_new(batch.schema(), arrays)
94}
95
#[cfg(test)]
pub mod tests {
    use std::sync::Arc;

    use arrow_array::{Array, Int32Array, RecordBatch, StringArray};
    use arrow_schema::{DataType, Field, Schema};

    /// `deep_copy_array` on a sliced nullable array keeps length and validity.
    #[test]
    fn test_deep_copy_sliced_array_with_nulls() {
        let values = vec![Some(1), None, Some(3), None, Some(5)];
        let source = Arc::new(Int32Array::from(values));
        let window = source.slice(1, 3);

        let duplicate = super::deep_copy_array(&window);

        assert_eq!(window.len(), duplicate.len());
        assert_eq!(window.nulls(), duplicate.nulls());
    }

    /// `deep_copy_array_data_sliced` returns only the visible rows, at offset 0.
    #[test]
    fn test_deep_copy_array_data_sliced() {
        let values: Vec<i32> = (0..1000).collect();
        let window = Int32Array::from(values).slice(100, 10);

        let window_data = window.to_data();
        let copied_data = super::deep_copy_array_data_sliced(&window_data);

        assert_eq!(copied_data.len(), 10);
        assert_eq!(copied_data.offset(), 0);

        // The copy must hold rows 100..110 of the source.
        let copied_array = Int32Array::from(copied_data);
        for (i, expected) in (100..110).enumerate() {
            assert_eq!(copied_array.value(i), expected);
        }
    }

    /// `deep_copy_array_sliced` keeps the values of the slice.
    #[test]
    fn test_deep_copy_array_sliced() {
        let source = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));
        let window = source.slice(1, 3);

        let duplicate = super::deep_copy_array_sliced(&window);
        assert_eq!(duplicate.len(), 3);

        let ints = duplicate.as_any().downcast_ref::<Int32Array>().unwrap();
        let got: Vec<i32> = (0..3).map(|i| ints.value(i)).collect();
        assert_eq!(got, vec![2, 3, 4]);
    }

    /// `deep_copy_batch_sliced` keeps the rows of a sliced two-column batch.
    #[test]
    fn test_deep_copy_batch_sliced() {
        let fields = vec![
            Field::new("id", DataType::Int32, false),
            Field::new("name", DataType::Utf8, false),
        ];
        let schema = Arc::new(Schema::new(fields));

        let id_values: Vec<i32> = (0..100).collect();
        let name_values: Vec<String> = (0..100).map(|i| format!("name_{}", i)).collect();
        let ids: Arc<dyn Array> = Arc::new(Int32Array::from(id_values));
        let names: Arc<dyn Array> = Arc::new(StringArray::from(name_values));

        let batch = RecordBatch::try_new(schema, vec![ids, names]).unwrap();

        let window = batch.slice(10, 5);
        let copied = super::deep_copy_batch_sliced(&window).unwrap();

        assert_eq!(copied.num_rows(), 5);
        assert_eq!(copied.num_columns(), 2);

        // Row `row` of the copy corresponds to row `10 + row` of the source.
        let id_col = copied
            .column(0)
            .as_any()
            .downcast_ref::<Int32Array>()
            .unwrap();
        let name_col = copied
            .column(1)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();

        for row in 0..5 {
            let source_row = 10 + row;
            assert_eq!(id_col.value(row), source_row as i32);
            assert_eq!(name_col.value(row), format!("name_{}", source_row));
        }
    }

    /// `deep_copy_array_sliced` keeps validity for a sliced nullable array.
    #[test]
    fn test_deep_copy_array_sliced_with_nulls() {
        let values = vec![Some(1), None, Some(3), None, Some(5)];
        let source = Arc::new(Int32Array::from(values));
        // Visible rows: [None, Some(3), None]
        let window = source.slice(1, 3);

        let duplicate = super::deep_copy_array_sliced(&window);

        assert_eq!(duplicate.len(), 3);
        assert_eq!(duplicate.null_count(), 2);

        let ints = duplicate.as_any().downcast_ref::<Int32Array>().unwrap();
        let validity: Vec<bool> = (0..3).map(|i| ints.is_valid(i)).collect();
        assert_eq!(validity, vec![false, true, false]);
        assert_eq!(ints.value(1), 3);
    }
}