Skip to main content

sedona_testing/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17use std::{env, fs};
18
19use datafusion_common::Result;
20use sedona_common::sedona_internal_err;
21
22/// Find the most likely path to the test GeoParquet file
23///
24/// See <https://github.com/geoarrow/geooarrow-data> for available files. Most files
25/// are available from a naive submodule checkout; however, some must be downloaded
26/// (e.g., for benchmarks).
27pub fn test_geoparquet(group: &str, name: &str) -> Result<String> {
28    let geoarrow_data = geoarrow_data_dir()?;
29    let path = format!("{geoarrow_data}/{group}/files/{group}_{name}_geo.parquet");
30    if let Ok(exists) = fs::exists(&path) {
31        if exists {
32            return Ok(path);
33        }
34    }
35
36    sedona_internal_err!(
37        "geoarrow-data test file '{path}' does not exist.\n{}\n{}",
38        "You may need to check the value of the SEDONA_GEOARROW_DATA_DIR environment variable,",
39        "run submodules/download-assets.py, or check the name of the file you requested"
40    )
41}
42
43/// Find the most likely path to the geoarrow-data testing directory if it exists
44///
45/// This looks for a geoarrow-data checkout using the value of SEDONA_GEOARROW_DATA_DIR,
46/// the directory that would be valid if running cargo run from the repository root,
47/// or the directory that would be valid if running cargo test (in that order).
48pub fn geoarrow_data_dir() -> Result<String> {
49    // Always use env-specified and error if it doesn't exist
50    if let Ok(from_env) = env::var("SEDONA_GEOARROW_DATA_DIR") {
51        if fs::exists(&from_env)? {
52            return Ok(from_env);
53        } else {
54            return sedona_internal_err!(
55                "{}\n{}{}{}",
56                "Can't resolve geoarrow-data from the current working directory because",
57                "the value of the SEDONA_GEOARROW_DATA_DIR (",
58                from_env,
59                ") does not exist"
60            );
61        }
62    }
63
64    let likely_possibilities = [
65        // Because we're in a cargo test from rust/some-crate
66        "../../submodules/geoarrow-data".to_string(),
67        // Because we're in the cli from cargo run
68        "submodules/geoarrow-data".to_string(),
69    ];
70
71    for possibility in likely_possibilities.into_iter().rev() {
72        if let Ok(exists) = fs::exists(&possibility) {
73            if exists {
74                return Ok(possibility);
75            }
76        }
77    }
78
79    sedona_internal_err!(
80        "{}\n{}\n{}",
81        "Can't resolve geoarrow-data from the current working directory",
82        "You may need to run `git submodule init && git submodule update --recursive` or",
83        "set the SEDONA_GEOARROW_DATA_DIR environment variable"
84    )
85}
86
87/// Find the most likely path to the sedona-testing directory if it exists
88///
89/// This mirrors [`geoarrow_data_dir`] but for the sedona-testing submodule.
90/// It checks the `SEDONA_TESTING_DIR` environment variable first, then
91/// falls back to the typical repository-relative locations.
92pub fn sedona_testing_dir() -> Result<String> {
93    if let Ok(from_env) = env::var("SEDONA_TESTING_DIR") {
94        if fs::exists(&from_env)? {
95            return Ok(from_env);
96        } else {
97            return sedona_internal_err!(
98                "{}\n{}{}{}",
99                "Can't resolve sedona-testing directory because",
100                "the value of the SEDONA_TESTING_DIR (",
101                from_env,
102                ") does not exist"
103            );
104        }
105    }
106
107    let likely_possibilities = [
108        "../../submodules/sedona-testing".to_string(),
109        "submodules/sedona-testing".to_string(),
110    ];
111
112    for possibility in likely_possibilities.into_iter().rev() {
113        if let Ok(exists) = fs::exists(&possibility) {
114            if exists {
115                return Ok(possibility);
116            }
117        }
118    }
119
120    sedona_internal_err!(
121        "{}\n{}\n{}",
122        "Can't resolve sedona-testing directory from the current working directory",
123        "You may need to run `git submodule init && git submodule update --recursive` or",
124        "set the SEDONA_TESTING_DIR environment variable"
125    )
126}
127
#[cfg(test)]
mod test {
    use super::*;
    use std::ffi::OsString;
    use std::sync::{Mutex, MutexGuard};

    // These tests mutate global state (environment variables), so they must run
    // serially. The SERIAL_TEST mutex ensures that only one test executes at a time,
    // preventing race conditions when modifying and restoring environment variables.
    static SERIAL_TEST: Mutex<()> = Mutex::new(());

    /// Acquire the serialization lock
    ///
    /// A test that panics while holding the lock poisons it. The mutex protects no
    /// data, so the poison flag is ignored; otherwise one failing test would make
    /// every other test in this module fail with an unrelated `PoisonError`.
    fn serial_guard() -> MutexGuard<'static, ()> {
        SERIAL_TEST
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Run `body` with the environment variable `key` set to `value`, then put
    /// `key` back to whatever it was before (set or unset)
    ///
    /// Restoring (rather than unconditionally removing) keeps a value the caller
    /// exported for the test run intact for the rest of the process. `body` should
    /// return its outcome instead of asserting, because the variable is only
    /// restored when `body` returns normally.
    fn with_env_var<T>(key: &str, value: &str, body: impl FnOnce() -> T) -> T {
        let previous: Option<OsString> = env::var_os(key);
        env::set_var(key, value);
        let outcome = body();
        match previous {
            Some(original) => env::set_var(key, original),
            None => env::remove_var(key),
        }
        outcome
    }

    #[test]
    fn example_files() {
        let _guard = serial_guard();

        // By default this should resolve, since we are in a test!
        assert!(geoarrow_data_dir().is_ok());
        assert!(test_geoparquet("natural-earth", "countries").is_ok());

        // Check a good data dir but a bad file
        let err = test_geoparquet("invalid group", "invalid name").unwrap_err();
        assert!(err.message().contains("geoarrow-data test file"));

        // Check a bad data dir
        let err = with_env_var(
            "SEDONA_GEOARROW_DATA_DIR",
            "this_directory_does_not_exist",
            geoarrow_data_dir,
        );
        assert!(err
            .unwrap_err()
            .message()
            .contains("the value of the SEDONA_GEOARROW_DATA_DIR"));

        // Check a good but explicitly specified data dir
        let resolved = geoarrow_data_dir().unwrap();
        let maybe_file = with_env_var("SEDONA_GEOARROW_DATA_DIR", &resolved, || {
            test_geoparquet("natural-earth", "countries")
        });
        assert!(maybe_file.is_ok());
    }

    #[test]
    fn sedona_testing_dir_resolves() {
        let _guard = serial_guard();

        // By default this should resolve, since we are in a test!
        assert!(sedona_testing_dir().is_ok());

        // Check a bad directory
        let err = with_env_var(
            "SEDONA_TESTING_DIR",
            "this_directory_does_not_exist",
            sedona_testing_dir,
        );
        assert!(err
            .unwrap_err()
            .message()
            .contains("the value of the SEDONA_TESTING_DIR"));

        // Check a good but explicitly specified directory
        let resolved = sedona_testing_dir().unwrap();
        let maybe_dir = with_env_var("SEDONA_TESTING_DIR", &resolved, sedona_testing_dir);
        assert!(maybe_dir.is_ok());
    }
}