use std::io::Cursor;
use calamine::{Data, Reader, open_workbook_auto_from_rs};
use super::*;
use crate::kb::content_store::atomic::sha256_hex;
/// MIME type of Office Open XML workbooks (`.xlsx`).
pub const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
/// MIME type of legacy binary Excel workbooks (`.xls`).
pub const XLS_MIME: &str = "application/vnd.ms-excel";
/// MIME type of OpenDocument spreadsheets (`.ods`).
pub const ODS_MIME: &str = "application/vnd.oasis.opendocument.spreadsheet";
/// Canonicalizer that renders spreadsheet workbooks (XLSX, XLS, ODS) as
/// Markdown, emitting one `## <sheet name>` section per non-empty sheet.
///
/// This is a unit struct: it carries no configuration or state.
pub struct SpreadsheetCanonicalizer;
impl SpreadsheetCanonicalizer {
fn cell_to_string(c: &Data) -> String {
match c {
Data::Empty => String::new(),
Data::String(s) => s.clone(),
Data::Float(f) => f.to_string(),
Data::Int(i) => i.to_string(),
Data::Bool(b) => b.to_string(),
Data::DateTime(dt) => dt.to_string(),
Data::DateTimeIso(s) => s.clone(),
Data::DurationIso(s) => s.clone(),
Data::Error(e) => format!("{e:?}"),
}
}
}
impl Canonicalizer for SpreadsheetCanonicalizer {
fn source_kind(&self) -> KbSourceKind {
KbSourceKind::Doc
}
fn supports_mime(&self, mime: &str) -> bool {
matches!(mime, XLSX_MIME | XLS_MIME | ODS_MIME)
}
fn canonicalize(&self, input: CanonicalizeInput<'_>) -> Result<Option<CanonicalizedSource>> {
let mut wb = open_workbook_auto_from_rs(Cursor::new(input.bytes))
.map_err(|e| anyhow::anyhow!("not a readable spreadsheet: {e}"))?;
let sheet_names = wb.sheet_names().to_vec();
let mut sections = Vec::new();
let mut total_rows = 0usize;
for name in &sheet_names {
let range = match wb.worksheet_range(name) {
Ok(r) => r,
Err(e) => {
tracing::warn!(sheet = %name, "spreadsheet: skipping unreadable sheet: {e}");
continue;
}
};
let mut lines = Vec::new();
for row in range.rows() {
let mut cells: Vec<String> = row.iter().map(Self::cell_to_string).collect();
while cells.last().is_some_and(|c| c.is_empty()) {
cells.pop();
}
if cells.is_empty() {
continue;
}
lines.push(cells.join(" | "));
}
if lines.is_empty() {
continue;
}
total_rows += lines.len();
sections.push(format!("## {name}\n\n{}", lines.join("\n")));
}
if sections.is_empty() {
return Ok(None);
}
let md = sections.join("\n\n");
let lsid = input
.logical_source_id_seed
.clone()
.unwrap_or_else(|| LogicalSourceId::for_file(&sha256_hex(input.bytes)));
let extra = serde_json::json!({
"n_sheets": sheet_names.len(),
"n_rows": total_rows,
});
Ok(Some(CanonicalizedSource {
markdown: md,
metadata: CanonicalMetadata {
source_kind: KbSourceKind::Doc,
logical_source_id: lsid,
title: input.hint_title.unwrap_or("Untitled.xlsx").to_string(),
mime: input.mime.to_string(),
created_at_ms: chrono::Utc::now().timestamp_millis(),
tags: vec![],
extra,
},
}))
}
}
#[cfg(test)]
mod tests {
    use rust_xlsxwriter::Workbook;

    use super::*;

    /// Wraps raw bytes in a `CanonicalizeInput` with a fixed title hint and
    /// no logical-source-id seed.
    fn input<'a>(bytes: &'a [u8], mime: &'a str) -> CanonicalizeInput<'a> {
        CanonicalizeInput {
            bytes,
            mime,
            hint_title: Some("book.xlsx"),
            logical_source_id_seed: None,
        }
    }

    /// Builds an in-memory XLSX with a header row and two data rows
    /// (a name column and a numeric score column).
    fn build_xlsx() -> Vec<u8> {
        let mut workbook = Workbook::new();
        let sheet = workbook.add_worksheet();
        sheet.write_string(0, 0, "姓名").unwrap();
        sheet.write_string(0, 1, "分数").unwrap();
        for (row, who, score) in [(1, "Alice", 95.0), (2, "Bob", 88.0)] {
            sheet.write_string(row, 0, who).unwrap();
            sheet.write_number(row, 1, score).unwrap();
        }
        workbook.save_to_buffer().unwrap()
    }

    #[test]
    fn xlsx_captures_strings_and_numbers() {
        let bytes = build_xlsx();
        let out = SpreadsheetCanonicalizer
            .canonicalize(input(&bytes, XLSX_MIME))
            .unwrap()
            .expect("some");
        // Header row first, then each data row rendered as "name | score".
        for expected in ["姓名 | 分数", "Alice | 95", "Bob | 88"] {
            assert!(out.markdown.contains(expected), "got: {}", out.markdown);
        }
    }

    #[test]
    fn empty_workbook_is_none() {
        // A workbook whose only sheet has no cells must produce no output.
        let mut workbook = Workbook::new();
        workbook.add_worksheet();
        let bytes = workbook.save_to_buffer().unwrap();
        let out = SpreadsheetCanonicalizer
            .canonicalize(input(&bytes, XLSX_MIME))
            .unwrap();
        assert!(out.is_none());
    }

    #[test]
    fn non_spreadsheet_bytes_is_error() {
        let result = SpreadsheetCanonicalizer.canonicalize(input(b"not a workbook", XLSX_MIME));
        assert!(result.is_err());
    }
}