Skip to main content

ferray_io/
datasource.rs

1// ferray-io: DataSource — local-file abstraction with transparent decompression.
2//
3// Analogue of `numpy.lib.npyio.DataSource`. NumPy's class is mostly a
4// path-resolution + URL-caching layer that returns file-like objects;
5// ferray's version covers the local-file half (the majority use case)
6// and transparently decompresses .gz inputs via flate2 (already in
7// workspace deps). URL fetching is intentionally not included — for
8// remote files the caller should download first via the project's
9// safe-fetch path and then point DataSource at the local file.
10
11use std::fs::File;
12use std::io::{BufReader, Read};
13use std::path::{Path, PathBuf};
14
15use ferray_core::error::{FerrayError, FerrayResult};
16use flate2::read::GzDecoder;
17
18/// Reader handle returned by [`DataSource::open`]. Implements [`Read`].
19pub enum DataSourceReader {
20    /// Plain (uncompressed) file.
21    Plain(BufReader<File>),
22    /// Gzip-compressed file.
23    Gzip(Box<GzDecoder<BufReader<File>>>),
24}
25
26impl Read for DataSourceReader {
27    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
28        match self {
29            Self::Plain(r) => r.read(buf),
30            Self::Gzip(r) => r.read(buf),
31        }
32    }
33}
34
/// Opens local files, decompressing gzip input transparently.
///
/// This is the counterpart of `numpy.lib.npyio.DataSource`. A
/// `DataSource` holds an optional base directory: `open(name)` joins
/// `name` onto that base (when one is set), opens the resulting file, and
/// wraps it in a gzip decoder when the path ends in `.gz`.
///
/// Deliberately out of scope compared with NumPy's class: URL fetching,
/// the `bz2`/`zstd` formats, and automatic caching. For remote data,
/// download it through the project's safe-fetch path first and then hand
/// the local copy to `DataSource`.
///
/// `DataSource::default()` has no base directory.
#[derive(Clone, Debug, Default)]
pub struct DataSource {
    /// Directory that names are resolved against; `None` means names are
    /// used as given.
    base: Option<PathBuf>,
}
50
51impl DataSource {
52    /// Create a new DataSource. If `base` is `Some`, paths passed to
53    /// [`open`](Self::open) and [`exists`](Self::exists) are resolved
54    /// relative to it.
55    #[must_use]
56    pub fn new(base: Option<PathBuf>) -> Self {
57        Self { base }
58    }
59
60    /// Resolve a name (relative to `base`, if set) to an absolute path.
61    ///
62    /// # Errors
63    /// Returns `FerrayError::IoError` if `std::path::absolute` fails to
64    /// canonicalize the path (e.g. permission errors on the parent).
65    pub fn abspath(&self, name: &str) -> FerrayResult<PathBuf> {
66        let p: PathBuf = self
67            .base
68            .as_ref()
69            .map_or_else(|| PathBuf::from(name), |b| b.join(name));
70        std::path::absolute(&p)
71            .map_err(|e| FerrayError::io_error(format!("DataSource::abspath: {e}")))
72    }
73
74    /// Whether the named file exists relative to the base directory.
75    #[must_use]
76    pub fn exists(&self, name: &str) -> bool {
77        let p = self
78            .base
79            .as_ref()
80            .map_or_else(|| PathBuf::from(name), |b| b.join(name));
81        p.exists()
82    }
83
84    /// Open a file for reading.
85    ///
86    /// If `name` ends in `.gz`, the returned reader transparently
87    /// decompresses on the fly.
88    ///
89    /// # Errors
90    /// Returns `FerrayError::IoError` if the file cannot be opened.
91    pub fn open(&self, name: &str) -> FerrayResult<DataSourceReader> {
92        let p = self
93            .base
94            .as_ref()
95            .map_or_else(|| PathBuf::from(name), |b| b.join(name));
96        Self::open_path(&p)
97    }
98
99    /// Open an absolute path directly, bypassing the configured base.
100    ///
101    /// # Errors
102    /// Returns `FerrayError::IoError` if the file cannot be opened.
103    pub fn open_path<P: AsRef<Path>>(path: P) -> FerrayResult<DataSourceReader> {
104        let path = path.as_ref();
105        let f = File::open(path).map_err(|e| {
106            FerrayError::io_error(format!(
107                "DataSource::open: failed to open '{}': {e}",
108                path.display()
109            ))
110        })?;
111        let buf = BufReader::new(f);
112        if matches!(path.extension().and_then(|e| e.to_str()), Some("gz")) {
113            Ok(DataSourceReader::Gzip(Box::new(GzDecoder::new(buf))))
114        } else {
115            Ok(DataSourceReader::Plain(buf))
116        }
117    }
118}
119
#[cfg(test)]
mod tests {
    use super::*;
    use flate2::Compression;
    use flate2::write::GzEncoder;
    use std::io::Write;
    use std::sync::atomic::{AtomicU64, Ordering};

    /// Uniquely named scratch directory under the system temp dir.
    ///
    /// The directory is removed when the guard is dropped, so it is
    /// cleaned up even when a test assertion panics part-way through.
    struct ScratchDir {
        path: PathBuf,
    }

    impl ScratchDir {
        /// Create a fresh, empty scratch directory.
        fn new() -> Self {
            // Per-process counter: guarantees distinct names even if two
            // tests running in parallel observe the same timestamp.
            static SEQ: AtomicU64 = AtomicU64::new(0);

            let nanos = std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .as_nanos();
            let path = std::env::temp_dir().join(format!(
                "ferray_io_datasource_{}_{}_{}",
                std::process::id(),
                nanos,
                SEQ.fetch_add(1, Ordering::Relaxed)
            ));
            std::fs::create_dir_all(&path).unwrap();
            Self { path }
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best-effort cleanup: panicking inside `drop` while a test is
            // already unwinding would abort the whole test process.
            let _ = std::fs::remove_dir_all(&self.path);
        }
    }

    /// A plain file is returned byte-for-byte.
    #[test]
    fn datasource_open_plain() {
        let dir = ScratchDir::new();
        std::fs::write(dir.path.join("hello.txt"), b"hello world").unwrap();

        let ds = DataSource::new(Some(dir.path.clone()));
        let mut reader = ds.open("hello.txt").unwrap();
        let mut text = String::new();
        reader.read_to_string(&mut text).unwrap();
        assert_eq!(text, "hello world");
    }

    /// A `.gz` file is decompressed transparently.
    #[test]
    fn datasource_open_gzip() {
        let dir = ScratchDir::new();
        let path = dir.path.join("greet.txt.gz");
        let file = File::create(&path).unwrap();
        let mut enc = GzEncoder::new(file, Compression::default());
        enc.write_all(b"compressed payload").unwrap();
        enc.finish().unwrap();

        let ds = DataSource::new(Some(dir.path.clone()));
        let mut reader = ds.open("greet.txt.gz").unwrap();
        let mut text = String::new();
        reader.read_to_string(&mut text).unwrap();
        assert_eq!(text, "compressed payload");
    }

    /// `exists` and `abspath` resolve names against the base directory.
    #[test]
    fn datasource_exists_and_abspath() {
        let dir = ScratchDir::new();
        std::fs::write(dir.path.join("present.txt"), b"x").unwrap();

        let ds = DataSource::new(Some(dir.path.clone()));
        assert!(ds.exists("present.txt"));
        assert!(!ds.exists("missing.txt"));

        let abs = ds.abspath("present.txt").unwrap();
        assert!(abs.is_absolute());
        assert!(abs.ends_with("present.txt"));
    }

    /// `open_path` reads the given path directly, with no base involved.
    #[test]
    fn datasource_open_path_reads_file() {
        let dir = ScratchDir::new();
        let path = dir.path.join("direct.txt");
        std::fs::write(&path, b"direct").unwrap();

        let mut reader = DataSource::open_path(&path).unwrap();
        let mut text = String::new();
        reader.read_to_string(&mut text).unwrap();
        assert_eq!(text, "direct");
    }

    /// Opening a file that does not exist reports an error.
    #[test]
    fn datasource_open_missing_file_errs() {
        let ds = DataSource::new(None);
        assert!(ds.open("/nonexistent/path/file.txt").is_err());
    }
}