Skip to main content

sedona_testing/
read.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17use std::fs::File;
18
19use arrow_array::{ArrayRef, RecordBatchReader};
20use datafusion_common::{DataFusionError, Result};
21use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
22use sedona_common::sedona_internal_err;
23use sedona_schema::datatypes::SedonaType;
24
25use crate::data::test_geoparquet;
26
27/// Options for test file readers
28#[derive(Debug, Clone)]
29pub struct TestReadOptions {
30    /// Type to use for geometry when reading test files
31    pub sedona_type: SedonaType,
32
33    /// Chunk size to output when reading test files
34    pub chunk_size: usize,
35
36    /// Approximate number of rows
37    ///
38    /// This number is approximate and the actual number will be obtained
39    /// by either truncating the input or cycling through batches until
40    /// at least this number is reached. If omitted, the entire test file
41    /// will be read.
42    pub output_size: Option<usize>,
43}
44
45impl TestReadOptions {
46    /// Create new options with defaults
47    pub fn new(sedona_type: SedonaType) -> Self {
48        TestReadOptions {
49            sedona_type,
50            chunk_size: 8192,
51            output_size: None,
52        }
53    }
54
55    /// Apply a target output size to these options
56    pub fn with_output_size(self, output_size: usize) -> Self {
57        TestReadOptions {
58            sedona_type: self.sedona_type,
59            chunk_size: self.chunk_size,
60            output_size: Some(output_size),
61        }
62    }
63}
64
65/// Read a geoarrow-data file's geometry column
66///
67/// This function is intended for reading data for benchmarks and tests
68pub fn read_geoarrow_data_geometry(
69    group: &str,
70    name: &str,
71    options: &TestReadOptions,
72) -> Result<Vec<ArrayRef>> {
73    let path = test_geoparquet(group, name)?;
74    let file = File::open(path).map_err(DataFusionError::IoError)?;
75    let reader = ParquetRecordBatchReader::try_new(file, options.chunk_size)
76        .map_err(|e| DataFusionError::External(Box::new(e)))?;
77
78    if reader.schema().fields().is_empty() {
79        return sedona_internal_err!("Unexpected schema: zero columns");
80    }
81
82    // True for all geoarrow-data files
83    let geometry_index = reader.schema().fields().len() - 1;
84    let raw_arrays = reader
85        .map(|batch| -> Result<ArrayRef> {
86            let array = batch?.column(geometry_index).clone();
87            // We may need something more sophisticated to support non-wkb geometry types
88            // This covers WKB and WKB_VIEW
89            Ok(arrow_cast::cast(
90                &array,
91                options.sedona_type.storage_type(),
92            )?)
93        })
94        .collect::<Result<Vec<_>>>()?;
95
96    apply_output_size(raw_arrays, options)
97}
98
99fn apply_output_size(arrays: Vec<ArrayRef>, options: &TestReadOptions) -> Result<Vec<ArrayRef>> {
100    if let Some(output_size) = options.output_size {
101        let mut out = Vec::new();
102        let mut i = 0;
103        let mut out_size = 0;
104        while out_size < output_size {
105            let array = &arrays[i % arrays.len()];
106            out_size += array.len();
107            i += 1;
108            out.push(array.clone());
109        }
110
111        Ok(out)
112    } else {
113        Ok(arrays)
114    }
115}
116
#[cfg(test)]
mod test {
    use sedona_schema::datatypes::WKB_GEOMETRY;

    use super::*;

    /// Reads the example geometry file with and without a target output size
    #[test]
    fn read() {
        // Without an output size the file comes back as read: one batch of 9 rows
        let defaults = TestReadOptions::new(WKB_GEOMETRY);
        let whole = read_geoarrow_data_geometry("example", "geometry", &defaults).unwrap();
        assert_eq!(whole.len(), 1);

        let first = &whole[0];
        assert_eq!(first.len(), 9);
        assert_eq!(first.data_type(), WKB_GEOMETRY.storage_type());

        // The 9-row batch has to be repeated 12 times (108 rows) to reach at
        // least 100 rows
        let cycling = TestReadOptions::new(WKB_GEOMETRY).with_output_size(100);
        let cycled = read_geoarrow_data_geometry("example", "geometry", &cycling).unwrap();
        assert_eq!(cycled.len(), 12);
    }
}
136}