Skip to main content

anyback_reader/
archive.rs

1use std::{
2    collections::BTreeSet,
3    fs,
4    io::Read,
5    path::{Path, PathBuf},
6    sync::{Arc, Mutex},
7};
8
9use anyhow::{Context, Result, anyhow};
10use serde::Serialize;
11use zip::ZipArchive;
12
13#[derive(Debug, Clone, Serialize)]
14pub struct ArchiveFileEntry {
15    pub path: String,
16    pub bytes: u64,
17}
18
/// The kind of container an archive was opened from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArchiveSourceKind {
    /// The archive is a plain directory tree on disk.
    Directory,
    /// The archive is a single zip file.
    Zip,
}

impl ArchiveSourceKind {
    /// Returns the lowercase textual name of this source kind
    /// (`"directory"` or `"zip"`).
    pub fn as_str(self) -> &'static str {
        const DIRECTORY_LABEL: &str = "directory";
        const ZIP_LABEL: &str = "zip";

        match self {
            ArchiveSourceKind::Zip => ZIP_LABEL,
            ArchiveSourceKind::Directory => DIRECTORY_LABEL,
        }
    }
}
33
34#[derive(Clone)]
35pub struct ArchiveReader {
36    root: PathBuf,
37    source: ArchiveSourceKind,
38    zip: Option<ZipReaderState>,
39}
40
41#[derive(Clone)]
42struct ZipReaderState {
43    archive: Arc<Mutex<ZipArchive<fs::File>>>,
44    files: Arc<Vec<ArchiveFileEntry>>,
45}
46
47impl std::fmt::Debug for ArchiveReader {
48    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
49        f.debug_struct("ArchiveReader")
50            .field("root", &self.root)
51            .field("source", &self.source)
52            .finish_non_exhaustive()
53    }
54}
55
56impl ArchiveReader {
57    pub fn from_path(path: &Path) -> Result<Self> {
58        if path.is_dir() {
59            return Ok(Self {
60                root: path.to_path_buf(),
61                source: ArchiveSourceKind::Directory,
62                zip: None,
63            });
64        }
65        if path.is_file() {
66            let file = fs::File::open(path)
67                .with_context(|| format!("failed to open archive file {}", path.display()))?;
68            if let Ok(mut zip) = ZipArchive::new(file) {
69                let mut files = Vec::new();
70                for idx in 0..zip.len() {
71                    let entry = zip.by_index(idx)?;
72                    if entry.is_dir() {
73                        continue;
74                    }
75                    files.push(ArchiveFileEntry {
76                        path: entry.name().to_string(),
77                        bytes: entry.size(),
78                    });
79                }
80                files.sort_by(|a, b| a.path.cmp(&b.path));
81                return Ok(Self {
82                    root: path.to_path_buf(),
83                    source: ArchiveSourceKind::Zip,
84                    zip: Some(ZipReaderState {
85                        archive: Arc::new(Mutex::new(zip)),
86                        files: Arc::new(files),
87                    }),
88                });
89            }
90        }
91        Err(anyhow!(
92            "archive must be a directory or zip file: {}",
93            path.display()
94        ))
95    }
96
97    pub fn source(&self) -> ArchiveSourceKind {
98        self.source
99    }
100
101    pub fn list_files(&self) -> Result<Vec<ArchiveFileEntry>> {
102        match self.source {
103            ArchiveSourceKind::Directory => {
104                let mut entries = Vec::new();
105                let mut stack = vec![self.root.clone()];
106                while let Some(dir) = stack.pop() {
107                    for entry in fs::read_dir(&dir)? {
108                        let entry = entry?;
109                        let path = entry.path();
110                        if path.is_dir() {
111                            stack.push(path);
112                            continue;
113                        }
114                        let rel = path.strip_prefix(&self.root).with_context(|| {
115                            format!("archive file not under root: {}", path.display())
116                        })?;
117                        let meta = entry.metadata()?;
118                        entries.push(ArchiveFileEntry {
119                            path: rel.to_string_lossy().into_owned(),
120                            bytes: meta.len(),
121                        });
122                    }
123                }
124                entries.sort_by(|a, b| a.path.cmp(&b.path));
125                Ok(entries)
126            }
127            ArchiveSourceKind::Zip => {
128                let state = self.zip_state()?;
129                Ok(state.files.as_ref().clone())
130            }
131        }
132    }
133
134    pub fn read_bytes(&self, rel_path: &str) -> Result<Vec<u8>> {
135        match self.source {
136            ArchiveSourceKind::Directory => {
137                let path = self.root.join(rel_path);
138                fs::read(&path)
139                    .with_context(|| format!("failed to read archive file {}", path.display()))
140            }
141            ArchiveSourceKind::Zip => {
142                let state = self.zip_state()?;
143                let mut zip = state
144                    .archive
145                    .lock()
146                    .map_err(|_| anyhow!("zip archive lock poisoned"))?;
147                let mut entry = zip
148                    .by_name(rel_path)
149                    .with_context(|| format!("archive entry not found in zip: {rel_path}"))?;
150                let mut out = Vec::new();
151                entry.read_to_end(&mut out)?;
152                drop(entry);
153                drop(zip);
154                Ok(out)
155            }
156        }
157    }
158
159    pub fn read_bytes_if_exists(&self, rel_path: &str) -> Result<Option<Vec<u8>>> {
160        match self.source {
161            ArchiveSourceKind::Directory => {
162                let path = self.root.join(rel_path);
163                if !path.is_file() {
164                    return Ok(None);
165                }
166                let bytes = fs::read(&path)
167                    .with_context(|| format!("failed to read archive file {}", path.display()))?;
168                Ok(Some(bytes))
169            }
170            ArchiveSourceKind::Zip => {
171                let state = self.zip_state()?;
172                let mut zip = state
173                    .archive
174                    .lock()
175                    .map_err(|_| anyhow!("zip archive lock poisoned"))?;
176                let Ok(mut entry) = zip.by_name(rel_path) else {
177                    return Ok(None);
178                };
179                let mut out = Vec::new();
180                entry.read_to_end(&mut out)?;
181                drop(entry);
182                drop(zip);
183                Ok(Some(out))
184            }
185        }
186    }
187
188    fn zip_state(&self) -> Result<&ZipReaderState> {
189        self.zip
190            .as_ref()
191            .ok_or_else(|| anyhow!("zip archive state unavailable"))
192    }
193}
194
/// Heuristic check for a content identifier: 20 to 128 bytes long, starting
/// with `bafy`, and made up solely of ASCII lowercase letters and digits.
fn looks_like_content_id(value: &str) -> bool {
    const PREFIX: &str = "bafy";

    if !(20..=128).contains(&value.len()) {
        return false;
    }
    if !value.starts_with(PREFIX) {
        return false;
    }
    // Bytes of non-ASCII characters never match either class, so checking
    // bytes rejects exactly the same inputs as checking chars.
    value
        .bytes()
        .all(|byte| byte.is_ascii_lowercase() || byte.is_ascii_digit())
}
203
204pub fn infer_object_id_from_snapshot_path(path: &str) -> Option<String> {
205    let file_name = Path::new(path).file_name().and_then(|s| s.to_str())?;
206    let stem = if let Some(stem) = file_name.strip_suffix(".pb.json") {
207        stem
208    } else if let Some(stem) = file_name.strip_suffix(".pb") {
209        stem
210    } else {
211        return None;
212    };
213    looks_like_content_id(stem).then(|| stem.to_string())
214}
215
216pub fn infer_object_ids_from_files(files: &[ArchiveFileEntry]) -> Vec<String> {
217    let mut ids = BTreeSet::new();
218    for entry in files {
219        let path = Path::new(&entry.path);
220        let under_objects = path
221            .components()
222            .next()
223            .and_then(|c| c.as_os_str().to_str())
224            .is_some_and(|root| root == "objects");
225        if !under_objects {
226            continue;
227        }
228        if let Some(id) = infer_object_id_from_snapshot_path(&entry.path) {
229            ids.insert(id);
230        }
231    }
232    ids.into_iter().collect()
233}
234
#[cfg(test)]
mod tests {
    use std::io::Write;

    use super::*;

    /// Reports whether `listing` contains an entry whose path is exactly `wanted`.
    fn has_path(listing: &[ArchiveFileEntry], wanted: &str) -> bool {
        listing.iter().any(|item| item.path == wanted)
    }

    /// A directory on disk is opened as a `Directory` archive and can be listed and read.
    #[test]
    fn reader_lists_and_reads_directory_archive() {
        let sandbox = tempfile::tempdir().unwrap();
        let root = sandbox.path();
        fs::create_dir_all(root.join("objects")).unwrap();
        fs::write(root.join("manifest.json"), b"{}").unwrap();
        fs::write(root.join("objects/obj.pb"), b"payload").unwrap();

        let reader = ArchiveReader::from_path(root).unwrap();
        assert_eq!(reader.source(), ArchiveSourceKind::Directory);

        let listing = reader.list_files().unwrap();
        assert!(has_path(&listing, "manifest.json"));
        assert!(has_path(&listing, "objects/obj.pb"));
        assert_eq!(reader.read_bytes("objects/obj.pb").unwrap(), b"payload");
    }

    /// A zip file is opened as a `Zip` archive and can be listed and read.
    #[test]
    fn reader_lists_and_reads_zip_archive() {
        let sandbox = tempfile::tempdir().unwrap();
        let zip_path = sandbox.path().join("archive.zip");

        // Build a two-member zip in the same order as the directory fixture.
        let members = [
            ("manifest.json", &b"{}"[..]),
            ("objects/obj.pb", &b"payload"[..]),
        ];
        let mut writer = zip::ZipWriter::new(fs::File::create(&zip_path).unwrap());
        for (name, contents) in members {
            writer
                .start_file(name, zip::write::SimpleFileOptions::default())
                .unwrap();
            writer.write_all(contents).unwrap();
        }
        writer.finish().unwrap();

        let reader = ArchiveReader::from_path(&zip_path).unwrap();
        assert_eq!(reader.source(), ArchiveSourceKind::Zip);

        let listing = reader.list_files().unwrap();
        assert!(has_path(&listing, "manifest.json"));
        assert!(has_path(&listing, "objects/obj.pb"));
        assert_eq!(reader.read_bytes("objects/obj.pb").unwrap(), b"payload");
    }

    /// A `bafy…` file stem with a `.pb` suffix is returned as the object id.
    #[test]
    fn infer_object_id_accepts_bafy_id_stems() {
        let id = "bafyreiaebddr63d7sye3eggmtkyeioqxftoaipobsynceksj6faedvd2xi";
        let snapshot = format!("objects/{id}.pb");
        let inferred = infer_object_id_from_snapshot_path(&snapshot);
        assert_eq!(inferred, Some(id.to_string()));
    }
}