Skip to main content

xls_rs/excel/
reader.rs

1use anyhow::{Context, Result};
2use calamine::{open_workbook, Ods, Reader, Xlsx};
3use std::collections::HashMap;
4use std::sync::{Arc, RwLock};
5
6use crate::csv_handler::CellRange;
7use crate::traits::DataReader;
8
9/// Excel metadata cache entry
10#[derive(Debug, Clone)]
11struct ExcelMetadata {
12    sheet_names: Vec<String>,
13    modified_time: Option<std::time::SystemTime>,
14}
15
16/// Thread-safe metadata cache for Excel files
17struct ExcelMetadataCache {
18    cache: Arc<RwLock<HashMap<String, ExcelMetadata>>>,
19}
20
21impl ExcelMetadataCache {
22    fn new() -> Self {
23        Self {
24            cache: Arc::new(RwLock::new(HashMap::new())),
25        }
26    }
27
28    fn get(&self, path: &str) -> Option<ExcelMetadata> {
29        let cache = self.cache.read().ok()?;
30        if let Some(metadata) = cache.get(path) {
31            // Check if file is still valid
32            if let Ok(current_modified) = std::fs::metadata(path).and_then(|m| m.modified()) {
33                if let Some(cached_modified) = metadata.modified_time {
34                    if current_modified == cached_modified {
35                        return Some(metadata.clone());
36                    }
37                }
38            }
39        }
40        None
41    }
42
43    fn insert(&self, path: String, metadata: ExcelMetadata) {
44        if let Ok(mut cache) = self.cache.write() {
45            // Simple cache eviction: remove oldest entries if cache gets too large
46            if cache.len() > 100 {
47                cache.clear();
48            }
49            cache.insert(path, metadata);
50        }
51    }
52
53    fn invalidate(&self, path: &str) {
54        if let Ok(mut cache) = self.cache.write() {
55            cache.remove(path);
56        }
57    }
58}
59
60/// Excel file handler
61pub struct ExcelHandler {
62    metadata_cache: ExcelMetadataCache,
63}
64
65impl ExcelHandler {
66    pub fn new() -> Self {
67        Self {
68            metadata_cache: ExcelMetadataCache::new(),
69        }
70    }
71
72    /// Resolve sheet name: use the first sheet if `requested` is `None`, otherwise require an exact
73    /// match so users get a clear error instead of a low-level calamine failure.
74    fn resolve_sheet_selection(requested: Option<&str>, available: &[String]) -> Result<String> {
75        match requested {
76            Some(name) => {
77                if available.iter().any(|s| s == name) {
78                    Ok(name.to_string())
79                } else {
80                    let list = if available.is_empty() {
81                        "(none)".to_string()
82                    } else {
83                        available.join(", ")
84                    };
85                    anyhow::bail!(
86                        "Sheet '{name}' not found in workbook. Available sheets: {list}"
87                    );
88                }
89            }
90            None => available
91                .first()
92                .cloned()
93                .ok_or_else(|| anyhow::anyhow!("No sheets found in workbook")),
94        }
95    }
96
97    /// Get or load Excel metadata with caching
98    fn get_metadata(&self, path: &str) -> Result<ExcelMetadata> {
99        // Check cache first
100        if let Some(metadata) = self.metadata_cache.get(path) {
101            return Ok(metadata);
102        }
103
104        // Load from file
105        let workbook: Xlsx<_> =
106            open_workbook(path).with_context(|| format!("Failed to open Excel file: {path}"))?;
107
108        let modified_time = std::fs::metadata(path)
109            .and_then(|m| m.modified())
110            .ok();
111
112        let metadata = ExcelMetadata {
113            sheet_names: workbook.sheet_names().to_vec(),
114            modified_time,
115        };
116
117        // Cache the metadata
118        self.metadata_cache.insert(path.to_string(), metadata.clone());
119
120        Ok(metadata)
121    }
122
123    pub fn read(&self, path: &str) -> Result<String> {
124        self.read_with_sheet(path, None)
125    }
126
127    pub fn read_with_sheet(&self, path: &str, sheet_name: Option<&str>) -> Result<String> {
128        let mut workbook: Xlsx<_> =
129            open_workbook(path).with_context(|| format!("Failed to open Excel file: {path}"))?;
130
131        let metadata = self.get_metadata(path)?;
132        let sheet_name = Self::resolve_sheet_selection(sheet_name, &metadata.sheet_names)?;
133
134        let range = workbook
135            .worksheet_range(&sheet_name)
136            .with_context(|| format!("Failed to read sheet: {sheet_name}"))?;
137
138        // Pre-allocate string capacity based on estimated size
139        let mut output = String::with_capacity(range.height() * range.width() * 10);
140        for row in range.rows() {
141            let row_str: Vec<String> = row.iter().map(|cell| cell.to_string()).collect();
142            output.push_str(&row_str.join(","));
143            output.push('\n');
144        }
145
146        Ok(output)
147    }
148
149    pub fn parse_cell_reference(&self, cell: &str) -> Result<(u32, u16)> {
150        let mut col_str = String::new();
151        let mut row_str = String::new();
152
153        for ch in cell.chars() {
154            if ch.is_alphabetic() {
155                col_str.push(ch);
156            } else if ch.is_ascii_digit() {
157                row_str.push(ch);
158            }
159        }
160
161        let col = self.column_to_index(&col_str)?;
162        let row = row_str
163            .parse::<u32>()
164            .with_context(|| format!("Invalid row number in cell reference: {cell}"))?;
165
166        Ok((row - 1, col))
167    }
168
169    fn column_to_index(&self, col: &str) -> Result<u16> {
170        let mut index = 0u16;
171        for ch in col.chars() {
172            index = index * 26 + (ch.to_ascii_uppercase() as u16 - b'A' as u16 + 1);
173        }
174        Ok(index - 1)
175    }
176
177    /// Read a specific range from Excel file
178    pub fn read_range(
179        &self,
180        path: &str,
181        range: &CellRange,
182        sheet_name: Option<&str>,
183    ) -> Result<Vec<Vec<String>>> {
184        let mut workbook: Xlsx<_> =
185            open_workbook(path).with_context(|| format!("Failed to open Excel file: {path}"))?;
186
187        let metadata = self.get_metadata(path)?;
188        let sheet_name = Self::resolve_sheet_selection(sheet_name, &metadata.sheet_names)?;
189
190        let ws_range = workbook
191            .worksheet_range(&sheet_name)
192            .with_context(|| format!("Failed to read sheet: {sheet_name}"))?;
193
194        let estimated_rows = range.end_row.saturating_sub(range.start_row) + 1;
195        let estimated_cols = range.end_col.saturating_sub(range.start_col) + 1;
196        let mut result = Vec::with_capacity(estimated_rows.min(1024));
197
198        for (row_idx, row) in ws_range.rows().enumerate() {
199            if row_idx < range.start_row {
200                continue;
201            }
202            if row_idx > range.end_row {
203                break;
204            }
205
206            let mut row_data = Vec::with_capacity(estimated_cols);
207            for (col_idx, cell) in row.iter().enumerate() {
208                if col_idx >= range.start_col && col_idx <= range.end_col {
209                    row_data.push(cell.to_string());
210                }
211            }
212            result.push(row_data);
213        }
214
215        Ok(result)
216    }
217
218    /// Read Excel and return as JSON array
219    pub fn read_as_json(&self, path: &str, sheet_name: Option<&str>) -> Result<String> {
220        let mut workbook: Xlsx<_> =
221            open_workbook(path).with_context(|| format!("Failed to open Excel file: {path}"))?;
222
223        let metadata = self.get_metadata(path)?;
224        let sheet_name = Self::resolve_sheet_selection(sheet_name, &metadata.sheet_names)?;
225
226        let range = workbook
227            .worksheet_range(&sheet_name)
228            .with_context(|| format!("Failed to read sheet: {sheet_name}"))?;
229
230        let mut rows: Vec<Vec<String>> = Vec::with_capacity(range.height());
231        for row in range.rows() {
232            let mut row_data = Vec::with_capacity(range.width());
233            for cell in row.iter() {
234                row_data.push(cell.to_string());
235            }
236            rows.push(row_data);
237        }
238
239        serde_json::to_string_pretty(&rows).with_context(|| "Failed to serialize to JSON")
240    }
241
242    /// Get list of sheet names in workbook (cached)
243    pub fn list_sheets(&self, path: &str) -> Result<Vec<String>> {
244        let metadata = self.get_metadata(path)?;
245        Ok(metadata.sheet_names)
246    }
247
248    /// Read all sheets at once, returns map of sheet_name -> data
249    pub fn read_all_sheets(
250        &self,
251        path: &str,
252    ) -> Result<std::collections::HashMap<String, Vec<Vec<String>>>> {
253        let mut workbook: Xlsx<_> =
254            open_workbook(path).with_context(|| format!("Failed to open Excel file: {path}"))?;
255
256        let sheet_names = workbook.sheet_names().to_vec();
257        let mut result = std::collections::HashMap::new();
258
259        for sheet_name in sheet_names {
260            let range = workbook
261                .worksheet_range(&sheet_name)
262                .with_context(|| format!("Failed to read sheet: {sheet_name}"))?;
263
264            let mut rows: Vec<Vec<String>> = Vec::new();
265            for row in range.rows() {
266                rows.push(row.iter().map(|cell| cell.to_string()).collect());
267            }
268
269            result.insert(sheet_name, rows);
270        }
271
272        Ok(result)
273    }
274
275    /// Read ODS as CSV-like string
276    pub fn read_ods(&self, path: &str, sheet_name: Option<&str>) -> Result<String> {
277        let mut workbook: Ods<_> =
278            open_workbook(path).with_context(|| format!("Failed to open ODS file: {path}"))?;
279
280        let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
281        let sheet_name = Self::resolve_sheet_selection(sheet_name, &sheet_names)?;
282
283        let range = workbook
284            .worksheet_range(&sheet_name)
285            .with_context(|| format!("Failed to read sheet: {sheet_name}"))?;
286
287        let mut output = String::new();
288        for row in range.rows() {
289            let row_str: Vec<String> = row.iter().map(|cell| cell.to_string()).collect();
290            output.push_str(&row_str.join(","));
291            output.push('\n');
292        }
293
294        Ok(output)
295    }
296
297    /// Read ODS into `Vec<Vec<String>>`
298    pub fn read_ods_data(&self, path: &str, sheet_name: Option<&str>) -> Result<Vec<Vec<String>>> {
299        let mut workbook: Ods<_> =
300            open_workbook(path).with_context(|| format!("Failed to open ODS file: {path}"))?;
301
302        let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
303        let sheet_name = Self::resolve_sheet_selection(sheet_name, &sheet_names)?;
304
305        let range = workbook
306            .worksheet_range(&sheet_name)
307            .with_context(|| format!("Failed to read sheet: {sheet_name}"))?;
308
309        let mut rows: Vec<Vec<String>> = Vec::new();
310        for row in range.rows() {
311            rows.push(row.iter().map(|cell| cell.to_string()).collect());
312        }
313
314        Ok(rows)
315    }
316
317    /// List sheets in ODS file
318    pub fn list_ods_sheets(&self, path: &str) -> Result<Vec<String>> {
319        let workbook: Ods<_> =
320            open_workbook(path).with_context(|| format!("Failed to open ODS file: {path}"))?;
321        Ok(workbook.sheet_names().to_vec())
322    }
323
324    /// Auto-detect format (XLSX/XLS/ODS) and read into `Vec<Vec<String>>`
325    pub fn read_auto(&self, path: &str, sheet_or_range: Option<&str>) -> Result<Vec<Vec<String>>> {
326        let path_lower = path.to_lowercase();
327
328        if path_lower.ends_with(".ods") {
329            return self.read_ods_data(path, sheet_or_range);
330        }
331
332        if path_lower.ends_with(".xlsx") || path_lower.ends_with(".xls") {
333            if let Some(range_str) = sheet_or_range {
334                let cell_range = CellRange::parse(range_str)?;
335                return self.read_range(path, &cell_range, None);
336            } else {
337                let csv_str = self.read_with_sheet(path, None)?;
338                let data = csv_str
339                    .lines()
340                    .filter(|l| !l.is_empty())
341                    .map(|l| l.split(',').map(|s| s.to_string()).collect())
342                    .collect();
343                return Ok(data);
344            }
345        }
346
347        anyhow::bail!("Unsupported file format: {path}")
348    }
349}
350
351impl DataReader for ExcelHandler {
352    fn read(&self, path: &str) -> Result<Vec<Vec<String>>> {
353        let csv_str = self.read_with_sheet(path, None)?;
354        let result: Vec<Vec<String>> = csv_str
355            .lines()
356            .filter(|l| !l.is_empty())
357            .map(|l| l.split(',').map(|s| s.to_string()).collect())
358            .collect();
359        Ok(result)
360    }
361
362    fn read_with_headers(&self, path: &str) -> Result<Vec<Vec<String>>> {
363        // Call the trait method explicitly to avoid conflict with inherent method
364        let csv_str = self.read_with_sheet(path, None)?;
365        let result: Vec<Vec<String>> = csv_str
366            .lines()
367            .filter(|l| !l.is_empty())
368            .map(|l| l.split(',').map(|s| s.to_string()).collect())
369            .collect();
370        Ok(result)
371    }
372
373    fn read_range(&self, path: &str, range: &CellRange) -> Result<Vec<Vec<String>>> {
374        // Direct implementation to avoid method name conflicts
375        let mut workbook: Xlsx<_> =
376            open_workbook(path).with_context(|| format!("Failed to open Excel file: {path}"))?;
377
378        let metadata = self.get_metadata(path)?;
379        let sheet_name = metadata
380            .sheet_names
381            .first()
382            .map(|s| s.as_str())
383            .ok_or_else(|| anyhow::anyhow!("No sheets found in workbook"))?;
384
385        let ws_range = workbook
386            .worksheet_range(sheet_name)
387            .with_context(|| format!("Failed to read sheet: {sheet_name}"))?;
388
389        let estimated_rows = range.end_row.saturating_sub(range.start_row) + 1;
390        let estimated_cols = range.end_col.saturating_sub(range.start_col) + 1;
391        let mut result = Vec::with_capacity(estimated_rows.min(1024));
392
393        for (row_idx, row) in ws_range.rows().enumerate() {
394            if row_idx < range.start_row {
395                continue;
396            }
397            if row_idx > range.end_row {
398                break;
399            }
400
401            let mut row_data = Vec::with_capacity(estimated_cols);
402            for (col_idx, cell) in row.iter().enumerate() {
403                if col_idx >= range.start_col && col_idx <= range.end_col {
404                    row_data.push(cell.to_string());
405                }
406            }
407            result.push(row_data);
408        }
409
410        Ok(result)
411    }
412
413    fn read_as_json(&self, path: &str) -> Result<String> {
414        let mut workbook: Xlsx<_> =
415            open_workbook(path).with_context(|| format!("Failed to open Excel file: {path}"))?;
416
417        let metadata = self.get_metadata(path)?;
418        let sheet_name = metadata
419            .sheet_names
420            .first()
421            .map(|s| s.as_str())
422            .ok_or_else(|| anyhow::anyhow!("No sheets found in workbook"))?;
423
424        let range = workbook
425            .worksheet_range(sheet_name)
426            .with_context(|| format!("Failed to read sheet: {sheet_name}"))?;
427
428        let mut rows: Vec<Vec<String>> = Vec::with_capacity(range.height());
429        for row in range.rows() {
430            let mut row_data = Vec::with_capacity(range.width());
431            for cell in row.iter() {
432                row_data.push(cell.to_string());
433            }
434            rows.push(row_data);
435        }
436
437        serde_json::to_string_pretty(&rows).with_context(|| "Failed to serialize to JSON")
438    }
439
440    fn supports_format(&self, path: &str) -> bool {
441        let path_lower = path.to_lowercase();
442        path_lower.ends_with(".xlsx") || path_lower.ends_with(".xls") || path_lower.ends_with(".ods")
443    }
444}