libxcsv/
lib.rs

1use anyhow::{Context, Result};
2use chrono;
3
4use quick_xml::events::Event;
5use quick_xml::reader::Reader;
6use std::collections::BTreeMap;
7use std::fs::File;
8use std::io::{BufRead, BufReader};
9use std::path::Path;
10use zip::ZipArchive;
11
12/// Information about a sheet in the workbook
13/// name: The name of the sheet
14/// path_in_zip: The path to the sheet XML inside the zip archive
15#[derive(Debug, Clone)]
16pub struct SheetInfo {
17    pub name: String,
18    pub path_in_zip: String,
19}
20
21/// Information about a cell style
22#[derive(Debug, Clone, Default)]
23pub struct StyleInfo {
24    pub is_date: bool,
25}
26
27/// Open the XLSX file as a ZipArchive
28/// Returns a new ZipArchive<BufReader<File>>
29///
30/// Examples
31/// let zip = open_zip(Path::new("example.xlsx"))?;
32pub fn open_zip(path: &Path) -> Result<ZipArchive<BufReader<File>>> {
33    let file = File::open(path)?;
34
35    let reader = BufReader::new(file);
36    let zip = ZipArchive::new(reader).context("Failed to read XLSX (zip) archive")?;
37    Ok(zip)
38}
39
40/// Parse the styles.xml to extract cell styles and identify date formats
41/// Returns a vector of StyleInfo
42pub fn parse_styles<R: BufRead>(reader: R) -> Result<Vec<StyleInfo>> {
43    let mut xml = Reader::from_reader(reader);
44    let mut buf = Vec::new();
45    let mut styles = Vec::new();
46    let mut num_fmts = BTreeMap::new();
47    let mut in_cell_xfs = false;
48
49    // Helper closure to process attributes of an <xf> tag
50    let process_xf = |attrs: quick_xml::events::attributes::Attributes,
51                      num_fmts: &BTreeMap<u32, String>|
52     -> Result<StyleInfo> {
53        let mut style = StyleInfo::default();
54        let mut num_fmt_id_attr = None;
55        let mut apply_num_fmt = true;
56
57        attrs.flatten().for_each(|a| match a.key.as_ref() {
58            b"numFmtId" => {
59                num_fmt_id_attr = String::from_utf8_lossy(&a.value).parse::<u32>().ok();
60            }
61            b"applyNumberFormat" => {
62                apply_num_fmt = String::from_utf8_lossy(&a.value).parse::<u32>().ok() == Some(1);
63            }
64            _ => {}
65        });
66
67        if apply_num_fmt {
68            if let Some(id) = num_fmt_id_attr {
69                // Check built-in formats
70                let is_builtin_date =
71                    matches!(id, 14..=22 | 27..=36 | 45..=47 | 50..=58 | 67..=71 | 75..=81);
72                if is_builtin_date {
73                    style.is_date = true;
74                } else if let Some(format_code) = num_fmts.get(&id) {
75                    // Check custom formats
76                    let lower = format_code.to_lowercase();
77                    if (lower.contains('y') || lower.contains('d') || lower.contains('m'))
78                        && !lower.contains('#')
79                    {
80                        style.is_date = true;
81                    }
82                }
83            }
84        }
85        Ok(style)
86    };
87
88    loop {
89        match xml.read_event_into(&mut buf) {
90            Ok(Event::Start(e)) => match e.name().as_ref() {
91                b"numFmt" => {
92                    let mut num_fmt_id = None;
93                    let mut format_code = None;
94
95                    e.attributes().flatten().for_each(|a| match a.key.as_ref() {
96                        b"numFmtId" => {
97                            num_fmt_id = String::from_utf8_lossy(&a.value).parse::<u32>().ok();
98                        }
99                        b"formatCode" => {
100                            format_code = Some(String::from_utf8_lossy(&a.value).into_owned());
101                        }
102                        _ => {}
103                    });
104
105                    if let (Some(id), Some(code)) = (num_fmt_id, format_code) {
106                        num_fmts.insert(id, code);
107                    }
108                    xml.read_to_end_into(e.name(), &mut Vec::new())?;
109                }
110                b"cellXfs" => in_cell_xfs = true,
111                b"xf" if in_cell_xfs => {
112                    styles.push(process_xf(e.attributes(), &num_fmts)?);
113                    xml.read_to_end_into(e.name(), &mut Vec::new())?;
114                }
115                _ => {}
116            },
117            Ok(Event::Empty(e)) => match e.name().as_ref() {
118                b"numFmt" => {
119                    let mut num_fmt_id = None;
120                    let mut format_code = None;
121
122                    e.attributes().flatten().for_each(|a| match a.key.as_ref() {
123                        b"numFmtId" => {
124                            num_fmt_id = String::from_utf8_lossy(&a.value).parse::<u32>().ok();
125                        }
126                        b"formatCode" => {
127                            format_code = Some(String::from_utf8_lossy(&a.value).into_owned());
128                        }
129                        _ => {}
130                    });
131
132                    if let (Some(id), Some(code)) = (num_fmt_id, format_code) {
133                        num_fmts.insert(id, code);
134                    }
135                }
136                b"xf" if in_cell_xfs => {
137                    styles.push(process_xf(e.attributes(), &num_fmts)?);
138                }
139                _ => {}
140            },
141            Ok(Event::End(e)) => {
142                if e.name().as_ref() == b"cellXfs" {
143                    in_cell_xfs = false;
144                }
145            }
146            Ok(Event::Eof) => break,
147            Err(e) => return Err(anyhow::anyhow!("XML error in styles: {}", e)),
148            _ => {}
149        }
150        buf.clear();
151    }
152    Ok(styles)
153}
154
155fn tag_eq_ignore_case(actual: &[u8], expect: &str) -> bool {
156    actual.eq_ignore_ascii_case(expect.as_bytes())
157        || actual.ends_with(expect.as_bytes())
158        || actual.ends_with(expect.to_ascii_lowercase().as_bytes())
159        || actual.ends_with(expect.to_ascii_uppercase().as_bytes())
160}
161
162/// Parse the workbook rels to make sure to find what sheet matches what data and the cell matching
163/// per row and sheet.
164pub fn parse_workbook_rels<R: BufRead>(reader: R) -> Result<BTreeMap<String, String>> {
165    // Map r:Id -> full path inside zip (xl/worksheets/sheet1.xml)
166    let mut xml = Reader::from_reader(reader);
167    // xml.config_mut().trim_text(true);
168    let mut buf = Vec::new();
169    let mut map = BTreeMap::new();
170    loop {
171        match xml.read_event_into(&mut buf) {
172            Ok(Event::Empty(e)) | Ok(Event::Start(e)) => {
173                if tag_eq_ignore_case(e.name().as_ref(), "Relationship") {
174                    let mut id = None;
175                    let mut target = None;
176
177                    e.attributes().flatten().for_each(|a| match a.key.as_ref() {
178                        b"Id" | b"r:Id" => {
179                            id = Some(String::from_utf8_lossy(&a.value).into_owned())
180                        }
181                        b"Target" => target = Some(String::from_utf8_lossy(&a.value).into_owned()),
182                        _ => {}
183                    });
184
185                    if let (Some(id), Some(target)) = (id, target) {
186                        map.insert(id, format!("xl/{}", target.trim_start_matches('/')));
187                    }
188                }
189            }
190            Ok(Event::Eof) => break,
191            Err(e) => return Err(anyhow::anyhow!("XML error in workbook.rels: {}", e)),
192            _ => {}
193        }
194        buf.clear();
195    }
196    Ok(map)
197}
198
199/// Parse the workbook itself
200/// Returns a vector of SheetInfo and a boolean indicating if the 1904 date system is used
201pub fn parse_workbook<R: BufRead>(
202    reader: R,
203    rels: &BTreeMap<String, String>,
204) -> Result<(Vec<SheetInfo>, bool)> {
205    let mut xml = Reader::from_reader(reader);
206    // xml.config_mut().trim_text(true);
207    let mut buf = Vec::new();
208    let mut sheets = Vec::new();
209    let mut is_1904 = false;
210    loop {
211        match xml.read_event_into(&mut buf) {
212            Ok(Event::Empty(e)) | Ok(Event::Start(e)) => match e.name().as_ref() {
213                b"sheet" => {
214                    let mut name = None;
215                    let mut r_id = None;
216
217                    e.attributes().flatten().for_each(|a| match a.key.as_ref() {
218                        b"name" => name = Some(String::from_utf8_lossy(&a.value).into_owned()),
219                        b"id" | b"r:id" => {
220                            r_id = Some(String::from_utf8_lossy(&a.value).into_owned())
221                        }
222                        _ => {}
223                    });
224
225                    if let (Some(name), Some(rid)) = (name, r_id) {
226                        if let Some(target) = rels.get(&rid) {
227                            sheets.push(SheetInfo {
228                                name,
229                                path_in_zip: target.clone(),
230                            });
231                        }
232                    }
233                }
234                b"workbookPr" => {
235                    e.attributes().flatten().into_iter().for_each(|a| {
236                        if a.key.as_ref() == b"date1904" {
237                            if let Ok(val) = a.decode_and_unescape_value(&xml) {
238                                is_1904 = val == "1" || val == "true";
239                            }
240                        }
241                    });
242                }
243                _ => {}
244            },
245            Ok(Event::Eof) => break,
246            Err(e) => return Err(anyhow::anyhow!("XML error in workbook.xml: {}", e)),
247            _ => {}
248        }
249        buf.clear();
250    }
251    Ok((sheets, is_1904))
252}
253
254/// Read the shared strings from the excel file
255/// Returns a vector of strings
256pub fn read_shared_strings<R: BufRead>(reader: R) -> Result<Vec<String>> {
257    let mut xml = Reader::from_reader(reader);
258    // xml.config_mut().trim_text(true);
259    let mut buf = Vec::new();
260    let mut strings = Vec::new();
261    let mut in_si = false;
262    let mut current = String::new();
263    loop {
264        match xml.read_event_into(&mut buf) {
265            Ok(Event::Start(e)) => {
266                if tag_eq_ignore_case(e.name().as_ref(), "si") {
267                    in_si = true;
268                    current.clear();
269                }
270            }
271            Ok(Event::End(e)) => {
272                if tag_eq_ignore_case(e.name().as_ref(), "si") {
273                    strings.push(current.clone());
274                    in_si = false;
275                }
276            }
277            Ok(Event::Text(t)) => {
278                if in_si {
279                    // Due to quick-xml 0.38.3 (i assume 0.37+)
280                    // The config is unescaping everything way too early.
281                    // So we have reverted to 0.31.0 to have a functioning parser
282                    // to show correct characters like angle brackets.
283                    current.push_str(&t.unescape()?);
284                }
285            }
286            Ok(Event::Eof) => break,
287            Err(e) => return Err(anyhow::anyhow!("XML error in sharedStrings: {}", e)),
288            _ => {}
289        }
290        buf.clear();
291    }
292    Ok(strings)
293}
294
295/// A cell reference in the form of column and row index
296#[derive(Debug, Clone, Copy, PartialEq)]
297pub struct CellRef {
298    pub col: u32,
299    pub row: u32,
300}
301
302/// Convert a column string (e.g., "A", "AB") to a 1-based index
303/// Examples:
304///   "A" -> 1
305///   "Z" -> 26
306///   "AA" -> 27
307///   "AB" -> 28
308pub fn col_to_index(col: &str) -> u32 {
309    let mut n: u32 = 0;
310
311    col.bytes().into_iter().for_each(|b| {
312        if !(b'A'..=b'Z').contains(&b) {
313            return;
314        }
315        n = n * 26 + ((b - b'A' + 1) as u32);
316    });
317
318    n
319}
320
321/// Parse a cell reference string (e.g., "A1", "BC23") into a CellRef struct
322/// Returns None if the input is invalid
323pub fn parse_cell_ref(s: &str) -> Option<CellRef> {
324    let mut col = String::new();
325    let mut row = String::new();
326
327    s.chars().into_iter().for_each(|c| {
328        if c.is_ascii_alphabetic() {
329            col.push(c.to_ascii_uppercase());
330        } else {
331            row.push(c);
332        }
333    });
334
335    if col.is_empty() || row.is_empty() {
336        return None;
337    }
338
339    Some(CellRef {
340        col: col_to_index(&col),
341        row: row.parse().ok()?,
342    })
343}
344
345/// Convert a sheet name to a lowercase filename-safe string
346/// Non-alphanumeric characters are replaced with underscores.
347/// If the resulting string is empty, "sheet" is returned.
348/// Examples:
349///   "Sheet1" -> "sheet1"
350///   "Data-Set_2024" -> "data-set_2024"
351///   "!!!" -> "sheet"
352///   "Sales Data (Q1)" -> "sales_data__q1_"
353pub fn to_lowercase_filename(name: &str) -> String {
354    let s: String = name
355        .chars()
356        .map(|c| {
357            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
358                c.to_ascii_lowercase()
359            } else {
360                '_'
361            }
362        })
363        .collect();
364
365    if s.is_empty() { "sheet".to_string() } else { s }
366}
367
368// Excel date/time utilities
369// Excel stores dates as serial numbers: days since 1900-01-01 (with 1900 incorrectly treated as leap year)
370static SECONDS_PER_DAY: f64 = 86400.0;
371
372/// Convert an Excel serial date to an ISO 8601 date string (UTC)
373/// If is_1904 is true, use the 1904 date system; otherwise, use the 1900 date system.
374/// Returns None if the serial number is invalid.
375/// Examples:
376///   excel_serial_to_iso_date(44197.0, false) -> Some("2021-01-01T00:00:00.000Z")
377///   excel_serial_to_iso_date(0.0, false) -> Some("1899-12-30T00:00:00.000Z")
378///   excel_serial_to_iso_date(1.0, false) -> Some("1899-12-31T00:00:00.000Z")
379///   excel_serial_to_iso_date(60.0, false) -> Some("1900-02-29T00:00:00.000Z") // Excel bug
380pub fn excel_serial_to_iso_date(serial: f64, is_1904: bool) -> Option<String> {
381    let excel_epoch_days = if is_1904 {
382        24107 // Days from 1970-01-01 to 1904-01-01
383    } else {
384        25569 // Days from 1970-01-01 to 1900-01-01
385    };
386
387    let days = serial.floor() as i32;
388    let time_fraction = serial - days as f64;
389
390    // In the 1900 system, Excel incorrectly treats 1900 as a leap year.
391    // This means any date after Feb 28, 1900, is off by one.
392    // Serial numbers 1-59 are unaffected. 60 is "Feb 29, 1900". 61 is Mar 1, 1900.
393    // Our epoch calculation for 1900 already accounts for this by starting from 1899-12-30,
394    // so we don't need a special adjustment here if using a correct epoch day count.
395    // The constant 25569 = days between 1970-01-01 and 1899-12-30.
396
397    let unix_days = days - excel_epoch_days;
398    let unix_seconds =
399        (unix_days as f64 * SECONDS_PER_DAY) + (time_fraction * SECONDS_PER_DAY).round();
400
401    let datetime = chrono::DateTime::from_timestamp(unix_seconds as i64, 0)?;
402    Some(datetime.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string())
403}
404
405/// Export a sheet XML to CSV file
406/// reader: BufRead of the sheet XML
407/// shared_strings: slice of shared strings
408/// styles: slice of StyleInfo
409/// is_1904: whether the workbook uses the 1904 date system
410/// out_path: path to output CSV file
411/// delimiter: CSV delimiter character (e.g., b',' or b';')
412/// Returns Result<()>
413pub fn export_sheet_xml_to_csv<R: BufRead>(
414    reader: R,
415    shared_strings: &[String],
416    styles: &[StyleInfo],
417    is_1904: bool,
418    out_path: &Path,
419    delimiter: u8,
420) -> Result<()> {
421    let mut xml = Reader::from_reader(reader);
422    let mut buf = Vec::new();
423    let mut wtr = csv::WriterBuilder::new()
424        .flexible(true)
425        .delimiter(delimiter)
426        .from_path(out_path)?;
427
428    let mut num_columns: Option<usize> = None;
429    let mut current_row_idx: u32 = 0;
430    let mut row_vals: Vec<String> = Vec::new();
431    let mut cell_col: Option<u32> = None;
432    let mut cell_type: Option<String> = None;
433    let mut cell_style_idx: Option<u32> = None;
434    let mut cell_val: String = String::new();
435
436    loop {
437        match xml.read_event_into(&mut buf) {
438            Ok(Event::Start(e)) => {
439                if tag_eq_ignore_case(e.name().as_ref(), "row") {
440                    let mut r_attr = None;
441
442                    e.attributes().flatten().into_iter().for_each(|a| {
443                        if a.key.as_ref() == b"r" {
444                            r_attr = String::from_utf8_lossy(&a.value).parse::<u32>().ok();
445                        }
446                    });
447
448                    let next = r_attr.unwrap_or(current_row_idx + 1);
449                    while current_row_idx + 1 < next {
450                        wtr.write_record(std::iter::empty::<String>())?;
451                        current_row_idx += 1;
452                    }
453                    current_row_idx = next;
454                    row_vals.clear();
455                } else if tag_eq_ignore_case(e.name().as_ref(), "c") {
456                    cell_col = None;
457                    cell_type = None;
458                    cell_val.clear();
459                    cell_style_idx = None;
460                    let mut r_attr: Option<CellRef> = None;
461
462                    e.attributes()
463                        .flatten()
464                        .into_iter()
465                        .for_each(|a| match a.key.as_ref() {
466                            b"r" => {
467                                r_attr = parse_cell_ref(&String::from_utf8_lossy(&a.value));
468                            }
469                            b"t" => {
470                                cell_type = Some(String::from_utf8_lossy(&a.value).into_owned())
471                            }
472                            b"s" => {
473                                cell_style_idx =
474                                    String::from_utf8_lossy(&a.value).parse::<u32>().ok();
475                            }
476                            _ => {}
477                        });
478
479                    if let Some(cr) = r_attr {
480                        cell_col = Some(cr.col);
481                    }
482                } else if tag_eq_ignore_case(e.name().as_ref(), "is") {
483                    cell_val.clear();
484                } else if tag_eq_ignore_case(e.name().as_ref(), "t") {
485                    // text will come in Text event
486                }
487            }
488            Ok(Event::End(e)) => {
489                if tag_eq_ignore_case(e.name().as_ref(), "c") {
490                    let col = cell_col.unwrap_or((row_vals.len() as u32) + 1);
491                    let needed = col as usize;
492                    if row_vals.len() < needed {
493                        row_vals.resize(needed, String::new());
494                    }
495
496                    let v = match cell_type.as_deref() {
497                        Some("s") => {
498                            if let Ok(idx) = cell_val.trim().parse::<usize>() {
499                                shared_strings.get(idx).cloned().unwrap_or_default()
500                            } else {
501                                String::new()
502                            }
503                        }
504                        Some("b") => if cell_val.trim() == "1" {
505                            "TRUE"
506                        } else {
507                            "FALSE"
508                        }
509                        .to_string(),
510                        Some("inlineStr") | Some("str") => cell_val.clone(),
511                        Some("e") => {
512                            format!("#ERROR:{}", cell_val)
513                        }
514                        _ => {
515                            // Numeric value
516                            match cell_val.trim().parse::<f64>() {
517                                Ok(num) => {
518                                    let is_date_style = cell_style_idx
519                                        .and_then(|idx| styles.get(idx as usize))
520                                        .is_some_and(|style_info| style_info.is_date);
521
522                                    if is_date_style {
523                                        excel_serial_to_iso_date(num, is_1904)
524                                            .unwrap_or_else(|| cell_val.clone())
525                                    } else {
526                                        cell_val.clone()
527                                    }
528                                }
529                                Err(_) => cell_val.clone(),
530                            }
531                        }
532                    };
533                    row_vals[(col as usize) - 1] = v;
534
535                    cell_col = None;
536                    cell_type = None;
537                    cell_val.clear();
538                    cell_style_idx = None;
539                } else if tag_eq_ignore_case(e.name().as_ref(), "row") {
540                    if num_columns.is_none() {
541                        let last_non_empty = row_vals.iter().rposition(|c| !c.is_empty());
542                        num_columns = Some(last_non_empty.map_or(0, |i| i + 1));
543                    }
544                    if let Some(n) = num_columns {
545                        if row_vals.len() < n {
546                            row_vals.resize(n, String::new());
547                        }
548                    }
549                    wtr.write_record(row_vals.iter())?;
550                    row_vals.clear();
551                }
552            }
553            Ok(Event::Text(t)) => {
554                let txt = t.unescape()?;
555                if !txt.is_empty() {
556                    cell_val.push_str(&txt);
557                }
558            }
559            Ok(Event::Eof) => break,
560            Err(e) => return Err(anyhow::anyhow!("XML error in worksheet: {}", e)),
561            _ => {}
562        }
563        buf.clear();
564    }
565    if !row_vals.is_empty() {
566        wtr.write_record(row_vals.iter())?;
567    }
568    wtr.flush()?;
569    Ok(())
570}
571
572#[cfg(test)]
573mod tests {
574    use super::*;
575    use std::fs;
576    use std::io::BufReader;
577    use tempfile::NamedTempFile;
578
579    #[test]
580    fn test_geo_coordinate_parsing_from_xml() {
581        let xml_data = r#"
582        <worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
583            <sheetData>
584                <row r="1">
585                    <c r="A1" t="s"><v>0</v></c>
586                    <c r="B1" t="s"><v>1</v></c>
587                </row>
588                <row r="2">
589                    <c r="A2"><v>10.123</v></c>
590                    <c r="B2"><v>-20.456</v></c>
591                </row>
592            </sheetData>
593        </worksheet>
594        "#;
595        let shared_strings = vec![
596            "origin_latitude".to_string(),
597            "origin_longitude".to_string(),
598        ];
599        let reader = BufReader::new(xml_data.as_bytes());
600        let temp_file = NamedTempFile::new().unwrap();
601        let out_path = temp_file.path();
602
603        export_sheet_xml_to_csv(reader, &shared_strings, &[], false, out_path, b',').unwrap();
604
605        let csv_content = fs::read_to_string(out_path).unwrap();
606        let expected_content = "origin_latitude,origin_longitude\n10.123,-20.456\n";
607        assert_eq!(csv_content, expected_content);
608    }
609}