1use anyhow::{Context, Result};
2use chrono;
3
4use quick_xml::events::Event;
5use quick_xml::reader::Reader;
6use std::collections::BTreeMap;
7use std::fs::File;
8use std::io::{BufRead, BufReader};
9use std::path::Path;
10use zip::ZipArchive;
11
12#[derive(Debug, Clone)]
16pub struct SheetInfo {
17 pub name: String,
18 pub path_in_zip: String,
19}
20
21#[derive(Debug, Clone, Default)]
23pub struct StyleInfo {
24 pub is_date: bool,
25}
26
27pub fn open_zip(path: &Path) -> Result<ZipArchive<BufReader<File>>> {
33 let file = File::open(path)?;
34
35 let reader = BufReader::new(file);
36 let zip = ZipArchive::new(reader).context("Failed to read XLSX (zip) archive")?;
37 Ok(zip)
38}
39
40pub fn parse_styles<R: BufRead>(reader: R) -> Result<Vec<StyleInfo>> {
43 let mut xml = Reader::from_reader(reader);
44 let mut buf = Vec::new();
45 let mut styles = Vec::new();
46 let mut num_fmts = BTreeMap::new();
47 let mut in_cell_xfs = false;
48
49 let process_xf = |attrs: quick_xml::events::attributes::Attributes,
51 num_fmts: &BTreeMap<u32, String>|
52 -> Result<StyleInfo> {
53 let mut style = StyleInfo::default();
54 let mut num_fmt_id_attr = None;
55 let mut apply_num_fmt = true;
56
57 attrs.flatten().for_each(|a| match a.key.as_ref() {
58 b"numFmtId" => {
59 num_fmt_id_attr = String::from_utf8_lossy(&a.value).parse::<u32>().ok();
60 }
61 b"applyNumberFormat" => {
62 apply_num_fmt = String::from_utf8_lossy(&a.value).parse::<u32>().ok() == Some(1);
63 }
64 _ => {}
65 });
66
67 if apply_num_fmt {
68 if let Some(id) = num_fmt_id_attr {
69 let is_builtin_date =
71 matches!(id, 14..=22 | 27..=36 | 45..=47 | 50..=58 | 67..=71 | 75..=81);
72 if is_builtin_date {
73 style.is_date = true;
74 } else if let Some(format_code) = num_fmts.get(&id) {
75 let lower = format_code.to_lowercase();
77 if (lower.contains('y') || lower.contains('d') || lower.contains('m'))
78 && !lower.contains('#')
79 {
80 style.is_date = true;
81 }
82 }
83 }
84 }
85 Ok(style)
86 };
87
88 loop {
89 match xml.read_event_into(&mut buf) {
90 Ok(Event::Start(e)) => match e.name().as_ref() {
91 b"numFmt" => {
92 let mut num_fmt_id = None;
93 let mut format_code = None;
94
95 e.attributes().flatten().for_each(|a| match a.key.as_ref() {
96 b"numFmtId" => {
97 num_fmt_id = String::from_utf8_lossy(&a.value).parse::<u32>().ok();
98 }
99 b"formatCode" => {
100 format_code = Some(String::from_utf8_lossy(&a.value).into_owned());
101 }
102 _ => {}
103 });
104
105 if let (Some(id), Some(code)) = (num_fmt_id, format_code) {
106 num_fmts.insert(id, code);
107 }
108 xml.read_to_end_into(e.name(), &mut Vec::new())?;
109 }
110 b"cellXfs" => in_cell_xfs = true,
111 b"xf" if in_cell_xfs => {
112 styles.push(process_xf(e.attributes(), &num_fmts)?);
113 xml.read_to_end_into(e.name(), &mut Vec::new())?;
114 }
115 _ => {}
116 },
117 Ok(Event::Empty(e)) => match e.name().as_ref() {
118 b"numFmt" => {
119 let mut num_fmt_id = None;
120 let mut format_code = None;
121
122 e.attributes().flatten().for_each(|a| match a.key.as_ref() {
123 b"numFmtId" => {
124 num_fmt_id = String::from_utf8_lossy(&a.value).parse::<u32>().ok();
125 }
126 b"formatCode" => {
127 format_code = Some(String::from_utf8_lossy(&a.value).into_owned());
128 }
129 _ => {}
130 });
131
132 if let (Some(id), Some(code)) = (num_fmt_id, format_code) {
133 num_fmts.insert(id, code);
134 }
135 }
136 b"xf" if in_cell_xfs => {
137 styles.push(process_xf(e.attributes(), &num_fmts)?);
138 }
139 _ => {}
140 },
141 Ok(Event::End(e)) => {
142 if e.name().as_ref() == b"cellXfs" {
143 in_cell_xfs = false;
144 }
145 }
146 Ok(Event::Eof) => break,
147 Err(e) => return Err(anyhow::anyhow!("XML error in styles: {}", e)),
148 _ => {}
149 }
150 buf.clear();
151 }
152 Ok(styles)
153}
154
155fn tag_eq_ignore_case(actual: &[u8], expect: &str) -> bool {
156 actual.eq_ignore_ascii_case(expect.as_bytes())
157 || actual.ends_with(expect.as_bytes())
158 || actual.ends_with(expect.to_ascii_lowercase().as_bytes())
159 || actual.ends_with(expect.to_ascii_uppercase().as_bytes())
160}
161
162pub fn parse_workbook_rels<R: BufRead>(reader: R) -> Result<BTreeMap<String, String>> {
165 let mut xml = Reader::from_reader(reader);
167 let mut buf = Vec::new();
169 let mut map = BTreeMap::new();
170 loop {
171 match xml.read_event_into(&mut buf) {
172 Ok(Event::Empty(e)) | Ok(Event::Start(e)) => {
173 if tag_eq_ignore_case(e.name().as_ref(), "Relationship") {
174 let mut id = None;
175 let mut target = None;
176
177 e.attributes().flatten().for_each(|a| match a.key.as_ref() {
178 b"Id" | b"r:Id" => {
179 id = Some(String::from_utf8_lossy(&a.value).into_owned())
180 }
181 b"Target" => target = Some(String::from_utf8_lossy(&a.value).into_owned()),
182 _ => {}
183 });
184
185 if let (Some(id), Some(target)) = (id, target) {
186 map.insert(id, format!("xl/{}", target.trim_start_matches('/')));
187 }
188 }
189 }
190 Ok(Event::Eof) => break,
191 Err(e) => return Err(anyhow::anyhow!("XML error in workbook.rels: {}", e)),
192 _ => {}
193 }
194 buf.clear();
195 }
196 Ok(map)
197}
198
199pub fn parse_workbook<R: BufRead>(
202 reader: R,
203 rels: &BTreeMap<String, String>,
204) -> Result<(Vec<SheetInfo>, bool)> {
205 let mut xml = Reader::from_reader(reader);
206 let mut buf = Vec::new();
208 let mut sheets = Vec::new();
209 let mut is_1904 = false;
210 loop {
211 match xml.read_event_into(&mut buf) {
212 Ok(Event::Empty(e)) | Ok(Event::Start(e)) => match e.name().as_ref() {
213 b"sheet" => {
214 let mut name = None;
215 let mut r_id = None;
216
217 e.attributes().flatten().for_each(|a| match a.key.as_ref() {
218 b"name" => name = Some(String::from_utf8_lossy(&a.value).into_owned()),
219 b"id" | b"r:id" => {
220 r_id = Some(String::from_utf8_lossy(&a.value).into_owned())
221 }
222 _ => {}
223 });
224
225 if let (Some(name), Some(rid)) = (name, r_id) {
226 if let Some(target) = rels.get(&rid) {
227 sheets.push(SheetInfo {
228 name,
229 path_in_zip: target.clone(),
230 });
231 }
232 }
233 }
234 b"workbookPr" => {
235 e.attributes().flatten().into_iter().for_each(|a| {
236 if a.key.as_ref() == b"date1904" {
237 if let Ok(val) = a.decode_and_unescape_value(&xml) {
238 is_1904 = val == "1" || val == "true";
239 }
240 }
241 });
242 }
243 _ => {}
244 },
245 Ok(Event::Eof) => break,
246 Err(e) => return Err(anyhow::anyhow!("XML error in workbook.xml: {}", e)),
247 _ => {}
248 }
249 buf.clear();
250 }
251 Ok((sheets, is_1904))
252}
253
254pub fn read_shared_strings<R: BufRead>(reader: R) -> Result<Vec<String>> {
257 let mut xml = Reader::from_reader(reader);
258 let mut buf = Vec::new();
260 let mut strings = Vec::new();
261 let mut in_si = false;
262 let mut current = String::new();
263 loop {
264 match xml.read_event_into(&mut buf) {
265 Ok(Event::Start(e)) => {
266 if tag_eq_ignore_case(e.name().as_ref(), "si") {
267 in_si = true;
268 current.clear();
269 }
270 }
271 Ok(Event::End(e)) => {
272 if tag_eq_ignore_case(e.name().as_ref(), "si") {
273 strings.push(current.clone());
274 in_si = false;
275 }
276 }
277 Ok(Event::Text(t)) => {
278 if in_si {
279 current.push_str(&t.unescape()?);
284 }
285 }
286 Ok(Event::Eof) => break,
287 Err(e) => return Err(anyhow::anyhow!("XML error in sharedStrings: {}", e)),
288 _ => {}
289 }
290 buf.clear();
291 }
292 Ok(strings)
293}
294
295#[derive(Debug, Clone, Copy, PartialEq)]
297pub struct CellRef {
298 pub col: u32,
299 pub row: u32,
300}
301
302pub fn col_to_index(col: &str) -> u32 {
309 let mut n: u32 = 0;
310
311 col.bytes().into_iter().for_each(|b| {
312 if !(b'A'..=b'Z').contains(&b) {
313 return;
314 }
315 n = n * 26 + ((b - b'A' + 1) as u32);
316 });
317
318 n
319}
320
321pub fn parse_cell_ref(s: &str) -> Option<CellRef> {
324 let mut col = String::new();
325 let mut row = String::new();
326
327 s.chars().into_iter().for_each(|c| {
328 if c.is_ascii_alphabetic() {
329 col.push(c.to_ascii_uppercase());
330 } else {
331 row.push(c);
332 }
333 });
334
335 if col.is_empty() || row.is_empty() {
336 return None;
337 }
338
339 Some(CellRef {
340 col: col_to_index(&col),
341 row: row.parse().ok()?,
342 })
343}
344
345pub fn to_lowercase_filename(name: &str) -> String {
354 let s: String = name
355 .chars()
356 .map(|c| {
357 if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
358 c.to_ascii_lowercase()
359 } else {
360 '_'
361 }
362 })
363 .collect();
364
365 if s.is_empty() { "sheet".to_string() } else { s }
366}
367
368static SECONDS_PER_DAY: f64 = 86400.0;
371
372pub fn excel_serial_to_iso_date(serial: f64, is_1904: bool) -> Option<String> {
381 let excel_epoch_days = if is_1904 {
382 24107 } else {
384 25569 };
386
387 let days = serial.floor() as i32;
388 let time_fraction = serial - days as f64;
389
390 let unix_days = days - excel_epoch_days;
398 let unix_seconds =
399 (unix_days as f64 * SECONDS_PER_DAY) + (time_fraction * SECONDS_PER_DAY).round();
400
401 let datetime = chrono::DateTime::from_timestamp(unix_seconds as i64, 0)?;
402 Some(datetime.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string())
403}
404
405pub fn export_sheet_xml_to_csv<R: BufRead>(
414 reader: R,
415 shared_strings: &[String],
416 styles: &[StyleInfo],
417 is_1904: bool,
418 out_path: &Path,
419 delimiter: u8,
420) -> Result<()> {
421 let mut xml = Reader::from_reader(reader);
422 let mut buf = Vec::new();
423 let mut wtr = csv::WriterBuilder::new()
424 .flexible(true)
425 .delimiter(delimiter)
426 .from_path(out_path)?;
427
428 let mut num_columns: Option<usize> = None;
429 let mut current_row_idx: u32 = 0;
430 let mut row_vals: Vec<String> = Vec::new();
431 let mut cell_col: Option<u32> = None;
432 let mut cell_type: Option<String> = None;
433 let mut cell_style_idx: Option<u32> = None;
434 let mut cell_val: String = String::new();
435
436 loop {
437 match xml.read_event_into(&mut buf) {
438 Ok(Event::Start(e)) => {
439 if tag_eq_ignore_case(e.name().as_ref(), "row") {
440 let mut r_attr = None;
441
442 e.attributes().flatten().into_iter().for_each(|a| {
443 if a.key.as_ref() == b"r" {
444 r_attr = String::from_utf8_lossy(&a.value).parse::<u32>().ok();
445 }
446 });
447
448 let next = r_attr.unwrap_or(current_row_idx + 1);
449 while current_row_idx + 1 < next {
450 wtr.write_record(std::iter::empty::<String>())?;
451 current_row_idx += 1;
452 }
453 current_row_idx = next;
454 row_vals.clear();
455 } else if tag_eq_ignore_case(e.name().as_ref(), "c") {
456 cell_col = None;
457 cell_type = None;
458 cell_val.clear();
459 cell_style_idx = None;
460 let mut r_attr: Option<CellRef> = None;
461
462 e.attributes()
463 .flatten()
464 .into_iter()
465 .for_each(|a| match a.key.as_ref() {
466 b"r" => {
467 r_attr = parse_cell_ref(&String::from_utf8_lossy(&a.value));
468 }
469 b"t" => {
470 cell_type = Some(String::from_utf8_lossy(&a.value).into_owned())
471 }
472 b"s" => {
473 cell_style_idx =
474 String::from_utf8_lossy(&a.value).parse::<u32>().ok();
475 }
476 _ => {}
477 });
478
479 if let Some(cr) = r_attr {
480 cell_col = Some(cr.col);
481 }
482 } else if tag_eq_ignore_case(e.name().as_ref(), "is") {
483 cell_val.clear();
484 } else if tag_eq_ignore_case(e.name().as_ref(), "t") {
485 }
487 }
488 Ok(Event::End(e)) => {
489 if tag_eq_ignore_case(e.name().as_ref(), "c") {
490 let col = cell_col.unwrap_or((row_vals.len() as u32) + 1);
491 let needed = col as usize;
492 if row_vals.len() < needed {
493 row_vals.resize(needed, String::new());
494 }
495
496 let v = match cell_type.as_deref() {
497 Some("s") => {
498 if let Ok(idx) = cell_val.trim().parse::<usize>() {
499 shared_strings.get(idx).cloned().unwrap_or_default()
500 } else {
501 String::new()
502 }
503 }
504 Some("b") => if cell_val.trim() == "1" {
505 "TRUE"
506 } else {
507 "FALSE"
508 }
509 .to_string(),
510 Some("inlineStr") | Some("str") => cell_val.clone(),
511 Some("e") => {
512 format!("#ERROR:{}", cell_val)
513 }
514 _ => {
515 match cell_val.trim().parse::<f64>() {
517 Ok(num) => {
518 let is_date_style = cell_style_idx
519 .and_then(|idx| styles.get(idx as usize))
520 .is_some_and(|style_info| style_info.is_date);
521
522 if is_date_style {
523 excel_serial_to_iso_date(num, is_1904)
524 .unwrap_or_else(|| cell_val.clone())
525 } else {
526 cell_val.clone()
527 }
528 }
529 Err(_) => cell_val.clone(),
530 }
531 }
532 };
533 row_vals[(col as usize) - 1] = v;
534
535 cell_col = None;
536 cell_type = None;
537 cell_val.clear();
538 cell_style_idx = None;
539 } else if tag_eq_ignore_case(e.name().as_ref(), "row") {
540 if num_columns.is_none() {
541 let last_non_empty = row_vals.iter().rposition(|c| !c.is_empty());
542 num_columns = Some(last_non_empty.map_or(0, |i| i + 1));
543 }
544 if let Some(n) = num_columns {
545 if row_vals.len() < n {
546 row_vals.resize(n, String::new());
547 }
548 }
549 wtr.write_record(row_vals.iter())?;
550 row_vals.clear();
551 }
552 }
553 Ok(Event::Text(t)) => {
554 let txt = t.unescape()?;
555 if !txt.is_empty() {
556 cell_val.push_str(&txt);
557 }
558 }
559 Ok(Event::Eof) => break,
560 Err(e) => return Err(anyhow::anyhow!("XML error in worksheet: {}", e)),
561 _ => {}
562 }
563 buf.clear();
564 }
565 if !row_vals.is_empty() {
566 wtr.write_record(row_vals.iter())?;
567 }
568 wtr.flush()?;
569 Ok(())
570}
571
572#[cfg(test)]
573mod tests {
574 use super::*;
575 use std::fs;
576 use std::io::BufReader;
577 use tempfile::NamedTempFile;
578
579 #[test]
580 fn test_geo_coordinate_parsing_from_xml() {
581 let xml_data = r#"
582 <worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
583 <sheetData>
584 <row r="1">
585 <c r="A1" t="s"><v>0</v></c>
586 <c r="B1" t="s"><v>1</v></c>
587 </row>
588 <row r="2">
589 <c r="A2"><v>10.123</v></c>
590 <c r="B2"><v>-20.456</v></c>
591 </row>
592 </sheetData>
593 </worksheet>
594 "#;
595 let shared_strings = vec![
596 "origin_latitude".to_string(),
597 "origin_longitude".to_string(),
598 ];
599 let reader = BufReader::new(xml_data.as_bytes());
600 let temp_file = NamedTempFile::new().unwrap();
601 let out_path = temp_file.path();
602
603 export_sheet_xml_to_csv(reader, &shared_strings, &[], false, out_path, b',').unwrap();
604
605 let csv_content = fs::read_to_string(out_path).unwrap();
606 let expected_content = "origin_latitude,origin_longitude\n10.123,-20.456\n";
607 assert_eq!(csv_content, expected_content);
608 }
609}