Skip to main content

batuta/serve/banco/
storage.rs

1//! Banco storage — manages ~/.banco/ directory structure.
2//!
3//! Provides content-addressable file storage for uploads, datasets, and runs.
//! Files are stored under a non-cryptographic content hash (FNV-1a style, not
//! actual SHA-256) so identical uploads share one file on disk.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::path::PathBuf;
9use std::sync::{Arc, RwLock};
10
/// Metadata about an uploaded file.
///
/// Serialized to JSON next to the stored content when the store is
/// disk-backed, and parsed back when a store is reopened on that directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileInfo {
    /// Store-assigned identifier of the form `file-<epoch secs>-<sequence>`.
    pub id: String,
    /// File name exactly as it was passed to `FileStore::store`.
    pub name: String,
    /// Length of the stored content in bytes.
    pub size_bytes: u64,
    /// MIME type guessed from the extension of `name`.
    pub content_type: String,
    /// Upload time in seconds since the Unix epoch.
    pub uploaded_at: u64,
    /// Hex digest of the content; also the on-disk file name of the content.
    pub content_hash: String,
}
21
22/// Detected file content type.
23impl FileInfo {
24    fn detect_content_type(name: &str) -> String {
25        match name.rsplit('.').next().map(str::to_lowercase).as_deref() {
26            Some("pdf") => "application/pdf",
27            Some("csv") => "text/csv",
28            Some("json") => "application/json",
29            Some("jsonl") => "application/jsonl",
30            Some("txt") => "text/plain",
31            Some("docx") => {
32                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
33            }
34            _ => "application/octet-stream",
35        }
36        .to_string()
37    }
38}
39
/// File store — in-memory index with optional disk backing.
pub struct FileStore {
    /// Metadata index keyed by file ID.
    files: RwLock<HashMap<String, FileInfo>>,
    /// In-memory content cache (keyed by file ID).
    content: RwLock<HashMap<String, Vec<u8>>>,
    /// Root directory for disk backing; `None` keeps everything in memory.
    /// Content and metadata files live in its `uploads` subdirectory.
    data_dir: Option<PathBuf>,
    /// Next sequence number, used as the trailing component of new file IDs.
    counter: std::sync::atomic::AtomicU64,
}
48
49impl FileStore {
50    /// Create an in-memory-only store (for testing).
51    #[must_use]
52    pub fn in_memory() -> Arc<Self> {
53        Arc::new(Self {
54            files: RwLock::new(HashMap::new()),
55            content: RwLock::new(HashMap::new()),
56            data_dir: None,
57            counter: std::sync::atomic::AtomicU64::new(0),
58        })
59    }
60
61    /// Create a store backed by a directory.
62    #[must_use]
63    pub fn with_data_dir(dir: PathBuf) -> Arc<Self> {
64        let uploads_dir = dir.join("uploads");
65        let _ = std::fs::create_dir_all(&uploads_dir);
66
67        // Load existing file metadata from disk
68        let mut files = HashMap::new();
69        let mut max_seq = 0u64;
70        if let Ok(entries) = std::fs::read_dir(&uploads_dir) {
71            for entry in entries.flatten() {
72                let path = entry.path();
73                if path.extension().and_then(|e| e.to_str()) == Some("json") {
74                    if let Ok(data) = std::fs::read_to_string(&path) {
75                        if let Ok(info) = serde_json::from_str::<FileInfo>(&data) {
76                            // Extract sequence number from ID for counter
77                            if let Some(seq_str) = info.id.rsplit('-').next() {
78                                if let Ok(seq) = seq_str.parse::<u64>() {
79                                    max_seq = max_seq.max(seq + 1);
80                                }
81                            }
82                            files.insert(info.id.clone(), info);
83                        }
84                    }
85                }
86            }
87        }
88
89        let loaded = files.len();
90        if loaded > 0 {
91            eprintln!("[banco] Loaded {loaded} files from {}", uploads_dir.display());
92        }
93
94        Arc::new(Self {
95            files: RwLock::new(files),
96            content: RwLock::new(HashMap::new()),
97            data_dir: Some(dir),
98            counter: std::sync::atomic::AtomicU64::new(max_seq),
99        })
100    }
101
102    /// Store a file, returning its metadata.
103    pub fn store(&self, name: &str, data: &[u8]) -> FileInfo {
104        let seq = self.counter.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
105        let id = format!("file-{}-{seq}", epoch_secs());
106        let content_hash = sha256_hex(data);
107
108        let info = FileInfo {
109            id: id.clone(),
110            name: name.to_string(),
111            size_bytes: data.len() as u64,
112            content_type: FileInfo::detect_content_type(name),
113            uploaded_at: epoch_secs(),
114            content_hash: content_hash.clone(),
115        };
116
117        // Write to disk if configured
118        if let Some(ref dir) = self.data_dir {
119            let path = dir.join("uploads").join(&content_hash);
120            let _ = std::fs::write(path, data);
121            // Also write metadata
122            let meta_path = dir.join("uploads").join(format!("{content_hash}.meta.json"));
123            let _ =
124                std::fs::write(meta_path, serde_json::to_string_pretty(&info).unwrap_or_default());
125        }
126
127        if let Ok(mut store) = self.files.write() {
128            store.insert(id.clone(), info.clone());
129        }
130
131        // Always cache content in memory for read_content()
132        if let Ok(mut cache) = self.content.write() {
133            cache.insert(id, data.to_vec());
134        }
135
136        info
137    }
138
139    /// List all files (most recent first).
140    #[must_use]
141    pub fn list(&self) -> Vec<FileInfo> {
142        let store = self.files.read().unwrap_or_else(|e| e.into_inner());
143        let mut files: Vec<FileInfo> = store.values().cloned().collect();
144        files.sort_by(|a, b| b.uploaded_at.cmp(&a.uploaded_at));
145        files
146    }
147
148    /// Get file metadata by ID.
149    #[must_use]
150    pub fn get(&self, id: &str) -> Option<FileInfo> {
151        self.files.read().unwrap_or_else(|e| e.into_inner()).get(id).cloned()
152    }
153
154    /// Get file content by ID (checks memory cache first, then disk).
155    #[must_use]
156    pub fn read_content(&self, id: &str) -> Option<Vec<u8>> {
157        // Check in-memory cache first
158        if let Ok(cache) = self.content.read() {
159            if let Some(data) = cache.get(id) {
160                return Some(data.clone());
161            }
162        }
163        // Fall back to disk
164        let info = self.get(id)?;
165        if let Some(ref dir) = self.data_dir {
166            let path = dir.join("uploads").join(&info.content_hash);
167            std::fs::read(path).ok()
168        } else {
169            None
170        }
171    }
172
173    /// Delete a file by ID.
174    pub fn delete(&self, id: &str) -> Result<(), StorageError> {
175        let info = {
176            let mut store = self.files.write().map_err(|_| StorageError::LockPoisoned)?;
177            store.remove(id).ok_or(StorageError::NotFound(id.to_string()))?
178        };
179
180        // Remove from caches
181        if let Ok(mut cache) = self.content.write() {
182            cache.remove(id);
183        }
184        if let Some(ref dir) = self.data_dir {
185            let _ = std::fs::remove_file(dir.join("uploads").join(&info.content_hash));
186            let _ = std::fs::remove_file(
187                dir.join("uploads").join(format!("{}.meta.json", info.content_hash)),
188            );
189        }
190
191        Ok(())
192    }
193
194    /// Number of stored files.
195    #[must_use]
196    pub fn len(&self) -> usize {
197        self.files.read().map(|s| s.len()).unwrap_or(0)
198    }
199
200    /// Check if empty.
201    #[must_use]
202    pub fn is_empty(&self) -> bool {
203        self.len() == 0
204    }
205}
206
/// Failures reported by fallible store operations.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum StorageError {
    /// No file with the contained ID exists in the store.
    NotFound(String),
    /// An internal lock was poisoned and could not be acquired.
    LockPoisoned,
}

impl std::fmt::Display for StorageError {
    /// Render a short, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::LockPoisoned => f.write_str("Internal lock error"),
            Self::NotFound(id) => write!(f, "File not found: {id}"),
        }
    }
}

impl std::error::Error for StorageError {}
224
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Yields 0 if the system clock reports a time before the epoch.
fn epoch_secs() -> u64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
228
/// Content digest used to name stored files (32 lowercase hex characters).
///
/// Despite the name this is NOT SHA-256 and not cryptographic. It runs two
/// 64-bit FNV-1a style lanes over `data` — both use the same multiplier and
/// differ only in their starting value — and concatenates them as two
/// zero-padded 16-digit hex numbers. It is meant for deduplication only.
fn sha256_hex(data: &[u8]) -> String {
    // 64-bit FNV prime, shared by both lanes.
    const PRIME: u64 = 0x0100_0000_01b3;
    const FIRST_SEED: u64 = 0xcbf2_9ce4_8422_2325;
    const SECOND_SEED: u64 = 0x6c62_272e_07bb_0142;

    let (first, second) = data.iter().fold((FIRST_SEED, SECOND_SEED), |(a, b), &byte| {
        let value = u64::from(byte);
        ((a ^ value).wrapping_mul(PRIME), (b ^ value).wrapping_mul(PRIME))
    });
    format!("{first:016x}{second:016x}")
}