use std::fs::File;
use std::io::{BufReader, Cursor, Read, Seek};
use std::path::Path;
use oxiarc_archive::zip::ZipReader;
use quick_xml::escape::unescape;
use quick_xml::events::attributes::Attribute;
use quick_xml::events::{BytesText, Event};
use quick_xml::Reader;
use crate::error::Result;
use super::cell::{parse_ref, XlsxCellValue};
use super::error::{invalid, io_err, std_io, xml_err, zip_err};
/// Sheet listing produced by `parse_workbook_manifest` from `xl/workbook.xml`.
///
/// The two vectors are filled in lock-step, so entry `i` of one describes the
/// same sheet as entry `i` of the other.
#[derive(Debug, Clone)]
pub(super) struct WorkbookManifest {
/// Sheet names in the order their `sheet` elements were encountered.
pub(super) sheet_names: Vec<String>,
/// Relationship id recorded for each sheet; an empty string when the
/// `sheet` element carried no id attribute.
pub(super) sheet_rids: Vec<String>,
}
/// Fully parsed workbook as returned by `load_workbook_from_reader`.
pub(super) struct LoadedWorkbook {
/// Sheet names taken from the workbook manifest.
pub(super) sheet_names: Vec<String>,
/// One parsed sheet per manifest entry, pushed in manifest order.
pub(super) sheets: Vec<LoadedSheet>,
/// Shared-string table from `xl/sharedStrings.xml`; empty when the archive
/// has no such entry.
pub(super) shared_strings: Vec<String>,
}
/// A single worksheet expanded into a rectangular grid of cell values.
#[derive(Debug, Clone)]
pub(super) struct LoadedSheet {
/// Sheet name copied from the workbook manifest.
pub(super) name: String,
/// Cell grid as built by `parse_worksheet`: every row is resized to `cols`
/// entries and positions without a parsed cell hold `XlsxCellValue::Empty`.
pub(super) rows: Vec<Vec<XlsxCellValue>>,
/// Width of the grid (the second value returned by `parse_worksheet`).
pub(super) cols: usize,
}
/// Undecoded worksheet part pulled out of the archive before XML parsing.
struct RawSheet {
/// Archive entry name, e.g. something under `xl/worksheets/`.
path: String,
/// Relationship id that points at this part; starts out empty and is only
/// filled in when the workbook relationships contain a matching target.
rid: String,
/// Extracted entry content as returned by the zip reader.
bytes: Vec<u8>,
}
/// Opens the file at `path` and loads it as an XLSX workbook.
///
/// The file is wrapped in a `BufReader` and handed to
/// `load_workbook_from_reader`, which does all of the actual parsing.
///
/// # Errors
///
/// Returns the `std_io`-mapped error when the file cannot be opened, or
/// whatever `load_workbook_from_reader` reports.
pub(super) fn load_workbook<P: AsRef<Path>>(path: P) -> Result<LoadedWorkbook> {
    let handle = File::open(path.as_ref()).map_err(std_io)?;
    load_workbook_from_reader(BufReader::new(handle))
}
pub(super) fn load_workbook_from_reader<R: Read + Seek>(reader: R) -> Result<LoadedWorkbook> {
let mut zr = ZipReader::new(reader).map_err(zip_err)?;
let entries: Vec<_> = zr.entries().to_vec();
let mut workbook_bytes: Option<Vec<u8>> = None;
let mut workbook_rels_bytes: Option<Vec<u8>> = None;
let mut shared_bytes: Option<Vec<u8>> = None;
let mut raw_sheets: Vec<RawSheet> = Vec::new();
for entry in &entries {
let name = entry.name.as_str();
if name == "xl/workbook.xml" {
workbook_bytes = Some(zr.extract(entry).map_err(zip_err)?);
} else if name == "xl/_rels/workbook.xml.rels" {
workbook_rels_bytes = Some(zr.extract(entry).map_err(zip_err)?);
} else if name == "xl/sharedStrings.xml" {
shared_bytes = Some(zr.extract(entry).map_err(zip_err)?);
} else if name.starts_with("xl/worksheets/") && name.ends_with(".xml") {
let bytes = zr.extract(entry).map_err(zip_err)?;
raw_sheets.push(RawSheet {
path: name.to_string(),
rid: String::new(), bytes,
});
}
}
let workbook_bytes =
workbook_bytes.ok_or_else(|| io_err("xlsx: workbook.xml missing from archive"))?;
let manifest = parse_workbook_manifest(&workbook_bytes)?;
if let Some(rels) = workbook_rels_bytes.as_deref() {
let rid_to_target = parse_rels(rels)?;
for raw in &mut raw_sheets {
for (rid, target) in &rid_to_target {
let normalized = if target.starts_with('/') {
target[1..].to_string()
} else {
format!("xl/{target}")
};
if normalized == raw.path || target == &raw.path {
raw.rid = rid.clone();
break;
}
}
}
}
let shared_strings = match shared_bytes.as_deref() {
Some(b) => parse_shared_strings(b)?,
None => Vec::new(),
};
let mut sheets: Vec<LoadedSheet> = Vec::with_capacity(manifest.sheet_names.len());
for (i, (name, rid)) in manifest
.sheet_names
.iter()
.zip(manifest.sheet_rids.iter())
.enumerate()
{
let fallback_path = format!("xl/worksheets/sheet{}.xml", i + 1);
let raw = raw_sheets
.iter()
.find(|r| !r.rid.is_empty() && r.rid == *rid)
.or_else(|| raw_sheets.iter().find(|r| r.path == fallback_path))
.ok_or_else(|| io_err(format!("xlsx: sheet '{name}' not found in archive")))?;
let (rows, cols) = parse_worksheet(&raw.bytes, &shared_strings)?;
sheets.push(LoadedSheet {
name: name.clone(),
rows,
cols,
});
}
Ok(LoadedWorkbook {
sheet_names: manifest.sheet_names,
sheets,
shared_strings,
})
}
/// Extracts the sheet list from the bytes of `xl/workbook.xml`.
///
/// Every `sheet` element (self-closing or not) contributes one entry, provided
/// it has a non-empty `name` attribute. The relationship id is read from an
/// attribute named `id` or any attribute whose name ends in `:id` (which
/// covers `r:id`); it stays empty when no such attribute exists. Attributes
/// that fail to parse are skipped.
///
/// # Errors
///
/// Returns an XML error on malformed input and an `invalid` error when no
/// sheet was found.
fn parse_workbook_manifest(bytes: &[u8]) -> Result<WorkbookManifest> {
    let mut reader = Reader::from_reader(Cursor::new(bytes));
    reader.config_mut().trim_text(true);

    let mut sheet_names: Vec<String> = Vec::new();
    let mut sheet_rids: Vec<String> = Vec::new();
    let mut scratch = Vec::new();
    loop {
        match reader.read_event_into(&mut scratch).map_err(xml_err)? {
            Event::Eof => break,
            Event::Empty(e) | Event::Start(e) if e.name().as_ref() == b"sheet" => {
                let (mut name, mut rid) = (String::new(), String::new());
                for attr in e.attributes().with_checks(false).flatten() {
                    match attr.key.as_ref() {
                        b"name" => name = attr_to_string(&attr)?,
                        key if key == b"id" || key.ends_with(b":id") => {
                            rid = attr_to_string(&attr)?;
                        }
                        _ => {}
                    }
                }
                // Unnamed sheets are dropped; names and ids stay index-aligned.
                if !name.is_empty() {
                    sheet_names.push(name);
                    sheet_rids.push(rid);
                }
            }
            _ => {}
        }
        scratch.clear();
    }

    if sheet_names.is_empty() {
        return Err(invalid("xlsx: workbook contains no sheets"));
    }
    Ok(WorkbookManifest {
        sheet_names,
        sheet_rids,
    })
}
/// Reads `(Id, Target)` pairs from a relationships part.
///
/// Each `Relationship` element with both a non-empty `Id` and a non-empty
/// `Target` attribute yields one pair, in document order. Attributes that
/// fail to parse are skipped.
///
/// # Errors
///
/// Returns an XML error when the document or an attribute value is malformed.
fn parse_rels(bytes: &[u8]) -> Result<Vec<(String, String)>> {
    let mut reader = Reader::from_reader(Cursor::new(bytes));
    reader.config_mut().trim_text(true);

    let mut pairs: Vec<(String, String)> = Vec::new();
    let mut scratch = Vec::new();
    loop {
        match reader.read_event_into(&mut scratch).map_err(xml_err)? {
            Event::Eof => break,
            Event::Empty(e) | Event::Start(e) if e.name().as_ref() == b"Relationship" => {
                let (mut id, mut target) = (String::new(), String::new());
                for attr in e.attributes().with_checks(false).flatten() {
                    match attr.key.as_ref() {
                        b"Id" => id = attr_to_string(&attr)?,
                        b"Target" => target = attr_to_string(&attr)?,
                        _ => {}
                    }
                }
                let incomplete = id.is_empty() || target.is_empty();
                if !incomplete {
                    pairs.push((id, target));
                }
            }
            _ => {}
        }
        scratch.clear();
    }
    Ok(pairs)
}
fn parse_shared_strings(bytes: &[u8]) -> Result<Vec<String>> {
let mut out: Vec<String> = Vec::new();
let mut reader = Reader::from_reader(Cursor::new(bytes));
reader.config_mut().trim_text(false);
let mut buf = Vec::new();
let mut in_si = false;
let mut in_t = false;
let mut accum = String::new();
loop {
match reader.read_event_into(&mut buf).map_err(xml_err)? {
Event::Eof => break,
Event::Start(e) => match e.name().as_ref() {
b"si" => {
in_si = true;
accum.clear();
}
b"t" => {
if in_si {
in_t = true;
}
}
_ => {}
},
Event::End(e) => match e.name().as_ref() {
b"si" => {
if in_si {
out.push(std::mem::take(&mut accum));
in_si = false;
}
}
b"t" => {
in_t = false;
}
_ => {}
},
Event::Text(t) => {
if in_si && in_t {
let s = decode_and_unescape(&t)?;
accum.push_str(&s);
}
}
Event::CData(c) => {
if in_si && in_t {
let s = std::str::from_utf8(c.as_ref())
.map_err(|e| io_err(format!("xlsx: invalid UTF-8 in CDATA: {e}")))?;
accum.push_str(s);
}
}
_ => {}
}
buf.clear();
}
Ok(out)
}
fn parse_worksheet(bytes: &[u8], shared: &[String]) -> Result<(Vec<Vec<XlsxCellValue>>, usize)> {
let mut reader = Reader::from_reader(Cursor::new(bytes));
reader.config_mut().trim_text(false);
let mut buf = Vec::new();
let mut cells: Vec<(usize, usize, XlsxCellValue)> = Vec::new();
let mut max_row = 0usize;
let mut max_col = 0usize;
let mut cur_ref: Option<(usize, usize)> = None;
let mut cur_type: CellType = CellType::Number;
let mut in_value = false;
let mut value_text = String::new();
let mut in_inline_text = false;
let mut inline_text = String::new();
loop {
match reader.read_event_into(&mut buf).map_err(xml_err)? {
Event::Eof => break,
Event::Start(e) => {
let name = e.name();
let tag = name.as_ref();
if tag == b"c" {
cur_ref = None;
cur_type = CellType::Number;
value_text.clear();
inline_text.clear();
for a in e.attributes().with_checks(false).flatten() {
let key = a.key.as_ref();
if key == b"r" {
let s = attr_to_string(&a)?;
cur_ref = Some(parse_ref(&s)?);
} else if key == b"t" {
cur_type = CellType::from_attr(a.value.as_ref());
}
}
} else if tag == b"v" {
in_value = true;
value_text.clear();
} else if tag == b"t" {
in_inline_text = true;
inline_text.clear();
}
}
Event::Empty(e) => {
let name = e.name();
let tag = name.as_ref();
if tag == b"c" {
let mut coord: Option<(usize, usize)> = None;
for a in e.attributes().with_checks(false).flatten() {
let key = a.key.as_ref();
if key == b"r" {
let s = attr_to_string(&a)?;
coord = Some(parse_ref(&s)?);
}
}
if let Some((r, c)) = coord {
cells.push((r, c, XlsxCellValue::Empty));
max_row = max_row.max(r + 1);
max_col = max_col.max(c + 1);
}
}
}
Event::End(e) => {
let name = e.name();
let tag = name.as_ref();
if tag == b"v" {
in_value = false;
} else if tag == b"t" {
in_inline_text = false;
} else if tag == b"c" {
let (r, c) = match cur_ref {
Some(v) => v,
None => continue,
};
let val = finalise_cell(&cur_type, &value_text, &inline_text, shared)?;
cells.push((r, c, val));
max_row = max_row.max(r + 1);
max_col = max_col.max(c + 1);
cur_ref = None;
}
}
Event::Text(t) => {
if in_value {
let s = decode_and_unescape(&t)?;
value_text.push_str(&s);
} else if in_inline_text {
let s = decode_and_unescape(&t)?;
inline_text.push_str(&s);
}
}
Event::CData(c) => {
let s = std::str::from_utf8(c.as_ref())
.map_err(|e| io_err(format!("xlsx: invalid UTF-8 in CDATA: {e}")))?;
if in_value {
value_text.push_str(s);
} else if in_inline_text {
inline_text.push_str(s);
}
}
_ => {}
}
buf.clear();
}
let mut rows: Vec<Vec<XlsxCellValue>> = vec![Vec::new(); max_row];
for row in rows.iter_mut() {
row.resize(max_col, XlsxCellValue::Empty);
}
for (r, c, v) in cells {
if r < rows.len() && c < max_col {
rows[r][c] = v;
}
}
Ok((rows, max_col))
}
/// Turns the raw text collected for one cell into an `XlsxCellValue`.
///
/// * `kind` – cell type taken from the cell's `t` attribute.
/// * `v` – text of the cell's `v` element (may be empty).
/// * `inline` – text collected for an inline string.
/// * `shared` – shared-string table used to resolve `SharedString` indices.
///
/// Mapping:
/// - `Number`: empty → `Empty`; parses as `f64` → `Number`; otherwise the raw
///   text is kept as `String`.
/// - `SharedString`: empty → `Empty`; otherwise `v` is an index into `shared`.
/// - `InlineStr` → `String(inline)`; `Str` → `String(v)`.
/// - `Boolean`: `true` only for `"1"`, `"TRUE"` or `"true"`.
/// - `Error` → `Error(v)`.
///
/// # Errors
///
/// Fails when a shared-string index is not a valid `usize` or is out of
/// bounds for `shared`.
fn finalise_cell(
    kind: &CellType,
    v: &str,
    inline: &str,
    shared: &[String],
) -> Result<XlsxCellValue> {
    match *kind {
        // Both numeric and shared-string cells treat a missing value as empty.
        CellType::Number | CellType::SharedString if v.is_empty() => Ok(XlsxCellValue::Empty),
        CellType::Number => {
            let value = v
                .parse::<f64>()
                .map_or_else(|_| XlsxCellValue::String(v.to_string()), XlsxCellValue::Number);
            Ok(value)
        }
        CellType::SharedString => {
            let idx: usize = v
                .parse()
                .map_err(|_| io_err(format!("xlsx: invalid shared-string index '{v}'")))?;
            if let Some(text) = shared.get(idx) {
                return Ok(XlsxCellValue::String(text.clone()));
            }
            Err(io_err(format!(
                "xlsx: shared-string index {idx} out of bounds ({} strings)",
                shared.len()
            )))
        }
        CellType::InlineStr => Ok(XlsxCellValue::String(inline.to_string())),
        CellType::Str => Ok(XlsxCellValue::String(v.to_string())),
        CellType::Boolean => Ok(XlsxCellValue::Boolean(matches!(v, "1" | "TRUE" | "true"))),
        CellType::Error => Ok(XlsxCellValue::Error(v.to_string())),
    }
}
/// Cell data type as declared by the `t` attribute of a `c` element.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CellType {
    /// Default type; also used for any unrecognised attribute value.
    Number,
    /// `t="s"`: the value is an index into the shared-string table.
    SharedString,
    /// `t="inlineStr"`: the text is stored inline in the cell.
    InlineStr,
    /// `t="str"`: the value is a plain string.
    Str,
    /// `t="b"`: boolean.
    Boolean,
    /// `t="e"`: error value.
    Error,
}

impl CellType {
    /// Maps the raw bytes of a `t` attribute to a cell type.
    ///
    /// Matching is exact and case-sensitive. Anything that is not one of the
    /// known tags — including bytes that are not valid UTF-8 — is `Number`.
    fn from_attr(v: &[u8]) -> Self {
        match std::str::from_utf8(v) {
            Ok("s") => Self::SharedString,
            Ok("inlineStr") => Self::InlineStr,
            Ok("str") => Self::Str,
            Ok("b") => Self::Boolean,
            Ok("e") => Self::Error,
            _ => Self::Number,
        }
    }
}
/// Returns the unescaped value of an XML attribute as an owned `String`.
///
/// # Errors
///
/// Returns the `xml_err`-mapped error when the value cannot be unescaped.
fn attr_to_string(a: &Attribute<'_>) -> Result<String> {
    let value = a.unescape_value().map_err(xml_err)?;
    Ok(String::from(value))
}
/// Decodes a text event and expands any XML escapes left in it.
///
/// # Errors
///
/// Returns an `io_err` describing the failure when the bytes cannot be
/// decoded or the decoded text contains an invalid escape.
fn decode_and_unescape(t: &BytesText<'_>) -> Result<String> {
    let decoded = match t.decode() {
        Ok(text) => text,
        Err(e) => return Err(io_err(format!("xlsx: text decode failure: {e}"))),
    };
    match unescape(&decoded) {
        Ok(plain) => Ok(plain.into_owned()),
        Err(e) => Err(io_err(format!("xlsx: text unescape failure: {e}"))),
    }
}