use std::io::{Cursor, Read};
use std::sync::LazyLock;
use calamine::{Data, Reader, open_workbook_auto_from_rs};
use quick_xml::Reader as XmlReader;
use quick_xml::events::Event;
use zip::ZipArchive;
use crate::document::{DocumentDegradation, DocumentFailureMode, DocumentUnitCount};
use crate::ingest::single_line;
use super::*;
/// Maximum number of sheets rendered from one spreadsheet (fixed; no environment override here).
pub(crate) const MAX_SHEETS: usize = 8;
/// Default cap on rows scanned per sheet; overridable via `OFFICE_MAX_ROWS_PER_SHEET`.
pub(crate) const MAX_ROWS_PER_SHEET: usize = 64;
/// Default cap on columns rendered per sheet; overridable via `OFFICE_MAX_COLUMNS_PER_SHEET`.
pub(crate) const MAX_COLUMNS_PER_SHEET: usize = 16;
/// Default cap on slide entries read from a deck; overridable via `OFFICE_MAX_SLIDES`.
pub(crate) const MAX_SLIDES: usize = 200;
/// Default cap, in bytes, on a single ZIP entry read as XML; overridable via
/// `OFFICE_MAX_ENTRY_BYTES`.
pub(crate) const MAX_ENTRY_BYTES: u64 = 5 * 1024 * 1024;
/// Size of the stack buffer used when reading a ZIP entry in chunks.
const ZIP_ENTRY_READ_CHUNK_BYTES: usize = 8 * 1024;
// Names of the environment variables that override the defaults above.
const OFFICE_MAX_ENTRY_BYTES_ENV: &str = "OFFICE_MAX_ENTRY_BYTES";
const OFFICE_MAX_SLIDES_ENV: &str = "OFFICE_MAX_SLIDES";
const OFFICE_MAX_ROWS_PER_SHEET_ENV: &str = "OFFICE_MAX_ROWS_PER_SHEET";
const OFFICE_MAX_COLUMNS_PER_SHEET_ENV: &str = "OFFICE_MAX_COLUMNS_PER_SHEET";
// Effective limits. Each is computed on first access from its environment variable
// and falls back to the compiled-in default when the variable is unset or invalid.
static OFFICE_MAX_ENTRY_BYTES: LazyLock<u64> =
LazyLock::new(|| office_env_u64(OFFICE_MAX_ENTRY_BYTES_ENV, MAX_ENTRY_BYTES));
static OFFICE_MAX_SLIDES: LazyLock<usize> =
LazyLock::new(|| office_env_usize(OFFICE_MAX_SLIDES_ENV, MAX_SLIDES));
static OFFICE_MAX_ROWS_PER_SHEET: LazyLock<usize> =
LazyLock::new(|| office_env_usize(OFFICE_MAX_ROWS_PER_SHEET_ENV, MAX_ROWS_PER_SHEET));
static OFFICE_MAX_COLUMNS_PER_SHEET: LazyLock<usize> =
LazyLock::new(|| office_env_usize(OFFICE_MAX_COLUMNS_PER_SHEET_ENV, MAX_COLUMNS_PER_SHEET));
/// Extracts markdown from an office document, choosing the extractor from the
/// extension of `file_name`.
///
/// `docx` and `pptx` have dedicated extractors; `xlsx`, `xls` and `ods` share the
/// spreadsheet extractor.
///
/// # Errors
/// Returns a document error when `file_name` has no extension, when the extension
/// is not one of the supported kinds, or when the selected extractor fails.
pub(crate) fn extract_office_document(
    file_name: &str,
    bytes: &[u8],
) -> Result<DocumentExtraction, WikiError> {
    let detected = extension(file_name);
    let Some(kind) = detected.as_deref() else {
        return Err(document_error("missing office extension"));
    };
    match kind {
        "docx" => extract_docx(bytes),
        "pptx" => extract_pptx(bytes),
        "xlsx" | "xls" | "ods" => extract_spreadsheet(bytes),
        extension => Err(document_error(format!(
            "unsupported office extension .{extension}"
        ))),
    }
}
fn office_max_entry_bytes() -> u64 {
*OFFICE_MAX_ENTRY_BYTES
}
fn office_max_slides() -> usize {
*OFFICE_MAX_SLIDES
}
fn office_max_rows_per_sheet() -> usize {
*OFFICE_MAX_ROWS_PER_SHEET
}
fn office_max_columns_per_sheet() -> usize {
*OFFICE_MAX_COLUMNS_PER_SHEET
}
/// Reads a positive `u64` limit from the environment variable `name`.
///
/// Returns `default` when the variable cannot be read. When it is set but does not
/// parse (after trimming) to a value greater than zero, a warning is logged and
/// `default` is returned.
fn office_env_u64(name: &str, default: u64) -> u64 {
    let raw = match std::env::var(name) {
        Ok(raw) => raw,
        Err(_) => return default,
    };
    let parsed = raw.trim().parse::<u64>().ok().filter(|value| *value > 0);
    parsed.unwrap_or_else(|| {
        log::warn!("invalid {name}={raw:?}; using default {default}");
        default
    })
}
/// Reads a positive `usize` limit from the environment variable `name`.
///
/// Returns `default` when the variable cannot be read. When it is set but does not
/// parse (after trimming) to a value greater than zero, a warning is logged and
/// `default` is returned.
fn office_env_usize(name: &str, default: usize) -> usize {
    let raw = match std::env::var(name) {
        Ok(raw) => raw,
        Err(_) => return default,
    };
    let parsed = raw.trim().parse::<usize>().ok().filter(|value| *value > 0);
    parsed.unwrap_or_else(|| {
        log::warn!("invalid {name}={raw:?}; using default {default}");
        default
    })
}
/// Extracts the paragraphs of a `.docx` archive as markdown.
///
/// Reads `word/document.xml`, joins its non-empty paragraphs with blank lines and
/// uses the first paragraph as the title.
///
/// # Errors
/// Returns a document error when the entry cannot be read or parsed, or when it
/// yields no paragraph text.
pub(crate) fn extract_docx(bytes: &[u8]) -> Result<DocumentExtraction, WikiError> {
    let document_xml = read_zip_entry(bytes, "word/document.xml")?;
    let paragraphs = extract_xml_paragraphs(&document_xml)?;
    let Some(first_paragraph) = paragraphs.first() else {
        return Err(document_error("docx contained no paragraph text"));
    };
    let title = Some(first_paragraph.clone());
    let units_count = paragraphs.len();
    let markdown = paragraphs.join("\n\n");
    Ok(DocumentExtraction {
        title,
        markdown,
        units_label: "paragraph_count",
        units_count,
        degradation: None,
    })
}
/// Extracts slide text from a `.pptx` archive as markdown, one `## Slide N` section
/// per slide that contains paragraph text.
///
/// Slide entries are the archive names starting with `ppt/slides/slide` and ending
/// in `.xml`, ordered by the number embedded in the name (names without a parsable
/// number sort last). At most `office_max_slides()` entries are read; when the deck
/// has more, a truncation note is appended and a degradation record is attached.
///
/// `N` in the heading is the entry's 1-based position in the sorted (and possibly
/// truncated) list, so slides skipped for having no text still consume a number.
/// The title is the first paragraph of the first slide that has text.
///
/// # Errors
/// Returns a document error when the archive cannot be opened, contains no slide
/// entries, a slide entry cannot be read or parsed, or no slide yields any text.
pub(crate) fn extract_pptx(bytes: &[u8]) -> Result<DocumentExtraction, WikiError> {
    let mut archive = zip_archive(bytes)?;
    let mut slide_names = archive
        .file_names()
        .filter(|name| name.starts_with("ppt/slides/slide") && name.ends_with(".xml"))
        .map(str::to_string)
        .collect::<Vec<_>>();
    // Stable sort: entries without a parsable number keep their archive order at the end.
    slide_names.sort_by_key(|name| slide_number(name).unwrap_or(usize::MAX));
    if slide_names.is_empty() {
        return Err(document_error("pptx contained no slide XML"));
    }

    let max_slides = office_max_slides();
    let slides_truncated = slide_names.len() > max_slides;
    if slides_truncated {
        log::warn!(
            "pptx extraction truncated slides: {} slides exceed {OFFICE_MAX_SLIDES_ENV}={max_slides}",
            slide_names.len()
        );
        slide_names.truncate(max_slides);
    }

    let mut markdown = String::new();
    let mut title = None;
    let mut slide_count = 0;
    for (index, name) in slide_names.iter().enumerate() {
        let xml = read_zip_entry_from_archive(&mut archive, name)?;
        let paragraphs = extract_xml_paragraphs(&xml)?;
        if paragraphs.is_empty() {
            // Slides without text are neither rendered nor counted.
            continue;
        }
        slide_count += 1;
        if title.is_none() {
            title = paragraphs.first().cloned();
        }
        if !markdown.is_empty() {
            markdown.push('\n');
        }
        markdown.push_str("## Slide ");
        markdown.push_str(&(index + 1).to_string());
        markdown.push_str("\n\n");
        markdown.push_str(&paragraphs.join("\n\n"));
        markdown.push_str("\n\n");
    }
    if markdown.trim().is_empty() {
        return Err(document_error("pptx contained no slide text"));
    }
    if slides_truncated {
        markdown.push_str("_Slides truncated for bounded extraction._\n\n");
    }

    let degradation = slides_truncated.then(|| {
        DocumentDegradation::new(
            DocumentFailureMode::OfficeBoundedExtraction,
            DocumentUnitCount::slides(slide_count),
            format!(
                "rendered {slide_count} slides from a deck exceeding {OFFICE_MAX_SLIDES_ENV}={max_slides}"
            ),
        )
    });
    Ok(DocumentExtraction {
        title,
        markdown: markdown.trim_end().to_string(),
        units_label: "slide_count",
        units_count: slide_count,
        degradation,
    })
}
/// Extracts the sheets of a spreadsheet workbook as markdown tables.
///
/// Only the first `MAX_SHEETS` sheets are considered. Within each sheet, only the
/// first `office_max_rows_per_sheet()` rows and `office_max_columns_per_sheet()`
/// columns are scanned, and rows with no text in that window are dropped. Each
/// sheet that still has rows is rendered as a `## Sheet: <name>` section; the first
/// rendered sheet name provides the title.
///
/// A degradation record is attached whenever any bound cut content: too many
/// sheets, or a sheet whose range exceeds the row/column window. This includes
/// sheets that are skipped because their bounded window happened to be empty,
/// which would otherwise drop content silently.
///
/// # Errors
/// Returns a document error when the workbook cannot be opened, has no sheets, a
/// sheet cannot be read, or no sheet yields any cell text.
fn extract_spreadsheet(bytes: &[u8]) -> Result<DocumentExtraction, WikiError> {
    let cursor = Cursor::new(bytes);
    let mut workbook = open_workbook_auto_from_rs(cursor)
        .map_err(|error| document_error(format!("open spreadsheet: {error}")))?;
    let sheet_names = workbook.sheet_names().to_vec();
    if sheet_names.is_empty() {
        return Err(document_error("spreadsheet contained no sheets"));
    }

    let sheets_truncated = sheet_names.len() > MAX_SHEETS;
    if sheets_truncated {
        log::warn!(
            "spreadsheet extraction truncated sheets: {} sheets exceed MAX_SHEETS={MAX_SHEETS}",
            sheet_names.len()
        );
    }

    let max_rows_per_sheet = office_max_rows_per_sheet();
    let max_columns_per_sheet = office_max_columns_per_sheet();
    let mut markdown = String::new();
    let mut title = None;
    let mut rendered_sheets = 0;
    let mut spreadsheet_truncated = sheets_truncated;
    for sheet_name in sheet_names.iter().take(MAX_SHEETS) {
        let range = workbook
            .worksheet_range(sheet_name)
            .map_err(|error| document_error(format!("read sheet {sheet_name}: {error}")))?;
        let rows = range
            .rows()
            .take(max_rows_per_sheet)
            .filter_map(|row| spreadsheet_row_text(row, max_columns_per_sheet))
            .collect::<Vec<_>>();
        // Evaluate truncation before the empty-sheet skip so that a sheet whose
        // content lies entirely outside the bounded window is still reported.
        let sheet_truncated =
            range.height() > max_rows_per_sheet || range.width() > max_columns_per_sheet;
        if rows.is_empty() {
            if sheet_truncated {
                spreadsheet_truncated = true;
                log::warn!(
                    "spreadsheet extraction skipped sheet `{}`: no cell text within {}={} and {}={} of {} rows x {} columns",
                    sheet_name,
                    OFFICE_MAX_ROWS_PER_SHEET_ENV,
                    max_rows_per_sheet,
                    OFFICE_MAX_COLUMNS_PER_SHEET_ENV,
                    max_columns_per_sheet,
                    range.height(),
                    range.width()
                );
            }
            continue;
        }
        if title.is_none() {
            title = Some(markdown_title(sheet_name));
        }
        rendered_sheets += 1;
        if !markdown.is_empty() {
            markdown.push('\n');
        }
        markdown.push_str("## Sheet: ");
        markdown.push_str(&single_line(sheet_name));
        markdown.push_str("\n\n");
        markdown.push_str(&markdown_table(&rows));
        markdown.push('\n');
        if sheet_truncated {
            spreadsheet_truncated = true;
            log::warn!(
                "spreadsheet extraction truncated sheet `{}`: {} rows x {} columns exceed {}={} or {}={}",
                sheet_name,
                range.height(),
                range.width(),
                OFFICE_MAX_ROWS_PER_SHEET_ENV,
                max_rows_per_sheet,
                OFFICE_MAX_COLUMNS_PER_SHEET_ENV,
                max_columns_per_sheet
            );
            markdown.push_str("\n_Table truncated for bounded extraction._\n");
        }
    }
    if markdown.trim().is_empty() {
        return Err(document_error("spreadsheet contained no cell text"));
    }
    if sheets_truncated {
        markdown.push_str("\n_Sheets truncated for bounded extraction._\n");
    }

    Ok(DocumentExtraction {
        title,
        markdown: markdown.trim_end().to_string(),
        units_label: "sheet_count",
        units_count: rendered_sheets,
        degradation: spreadsheet_truncated.then(|| {
            DocumentDegradation::new(
                DocumentFailureMode::OfficeBoundedExtraction,
                DocumentUnitCount::sheets(rendered_sheets),
                format!(
                    "rendered {rendered_sheets} sheets within MAX_SHEETS={MAX_SHEETS}, \
                     {OFFICE_MAX_ROWS_PER_SHEET_ENV}={max_rows_per_sheet}, and \
                     {OFFICE_MAX_COLUMNS_PER_SHEET_ENV}={max_columns_per_sheet}"
                ),
            )
        }),
    })
}
/// Opens `bytes` as a ZIP archive and returns the entry `name` decoded as UTF-8.
///
/// # Errors
/// Returns a document error when the archive cannot be opened or the entry cannot
/// be read within the configured size limit.
fn read_zip_entry(bytes: &[u8], name: &str) -> Result<String, WikiError> {
    zip_archive(bytes).and_then(|mut archive| read_zip_entry_from_archive(&mut archive, name))
}
fn read_zip_entry_from_archive(
archive: &mut ZipArchive<Cursor<&[u8]>>,
name: &str,
) -> Result<String, WikiError> {
let mut entry = archive
.by_name(name)
.map_err(|error| document_error(format!("read {name}: {error}")))?;
let max_entry_bytes = office_max_entry_bytes();
if entry.size() > max_entry_bytes {
log::warn!(
"office ZIP entry `{name}` is {} bytes and exceeds {OFFICE_MAX_ENTRY_BYTES_ENV}={max_entry_bytes}",
entry.size()
);
return Err(document_error(format!(
"{name} is {} bytes; maximum supported XML entry is {max_entry_bytes} bytes",
entry.size()
)));
}
let mut bytes = Vec::new();
let mut total = 0_u64;
let mut buffer = [0_u8; ZIP_ENTRY_READ_CHUNK_BYTES];
loop {
let read = entry
.read(&mut buffer)
.map_err(|error| document_error(format!("read {name}: {error}")))?;
if read == 0 {
break;
}
total = total.saturating_add(read as u64);
if total > max_entry_bytes {
log::warn!(
"office ZIP entry `{name}` exceeded {OFFICE_MAX_ENTRY_BYTES_ENV}={max_entry_bytes} while reading"
);
return Err(document_error(format!(
"{name} exceeds maximum supported XML entry size of {max_entry_bytes} bytes"
)));
}
bytes.extend_from_slice(&buffer[..read]);
}
String::from_utf8(bytes).map_err(|error| document_error(format!("decode {name}: {error}")))
}
fn zip_archive(bytes: &[u8]) -> Result<ZipArchive<Cursor<&[u8]>>, WikiError> {
ZipArchive::new(Cursor::new(bytes))
.map_err(|error| document_error(format!("open zip: {error}")))
}
/// Collects the text of every paragraph element (`p`, any namespace prefix) in
/// `xml`, in document order.
///
/// Only character data and CDATA inside text elements (`t`) contribute to a
/// paragraph. Text found elsewhere inside a paragraph is ignored with a warning.
/// Each paragraph's accumulated text is passed through `decode_xml_entities` and
/// `single_line`; paragraphs that end up empty are dropped with a warning that
/// describes what the paragraph contained.
///
/// # Errors
/// Returns a document error prefixed with `parse xml:` when the reader reports a
/// parse error.
fn extract_xml_paragraphs(xml: &str) -> Result<Vec<String>, WikiError> {
    let mut reader = XmlReader::from_str(xml);
    // Text must NOT be trimmed by the reader: trimming also applies inside text
    // elements, so a run such as `<w:t xml:space="preserve">Hello </w:t>` followed
    // by `<w:t>world</w:t>` would be glued into "Helloworld". Whitespace-only text
    // outside text elements is filtered by `warn_ignored_office_text`, and the
    // finished paragraph is normalized by `single_line`.
    reader.config_mut().trim_text(false);
    let mut paragraphs = Vec::new();
    let mut current = String::new();
    let mut in_paragraph = false;
    let mut in_text = false;
    // Per-paragraph diagnostics, used only to pick the warning for empty paragraphs.
    let mut paragraph_saw_text_element = false;
    let mut paragraph_saw_ignored_text = false;
    let mut paragraph_saw_unknown_xml = false;
    loop {
        match reader.read_event() {
            Ok(Event::Start(event)) => match local_name(event.name().as_ref()) {
                b"p" => {
                    current.clear();
                    in_paragraph = true;
                    paragraph_saw_text_element = false;
                    paragraph_saw_ignored_text = false;
                    paragraph_saw_unknown_xml = false;
                }
                b"t" => {
                    in_text = true;
                    if in_paragraph {
                        paragraph_saw_text_element = true;
                    }
                }
                _ => {
                    if in_paragraph {
                        paragraph_saw_unknown_xml = true;
                    }
                }
            },
            Ok(Event::End(event)) => match local_name(event.name().as_ref()) {
                b"p" => {
                    let paragraph = single_line(&decode_xml_entities(&current));
                    if !paragraph.is_empty() {
                        paragraphs.push(paragraph);
                    } else {
                        warn_empty_office_paragraph(
                            paragraph_saw_text_element,
                            paragraph_saw_ignored_text,
                            paragraph_saw_unknown_xml,
                        );
                    }
                    current.clear();
                    in_paragraph = false;
                }
                b"t" => in_text = false,
                _ => {}
            },
            Ok(Event::Text(text)) if in_text => {
                current.push_str(&String::from_utf8_lossy(text.as_ref()));
            }
            Ok(Event::CData(text)) if in_text => {
                current.push_str(&String::from_utf8_lossy(text.as_ref()));
            }
            Ok(Event::Text(text)) => {
                warn_ignored_office_text(
                    text.as_ref(),
                    in_paragraph,
                    &mut paragraph_saw_ignored_text,
                );
            }
            Ok(Event::CData(text)) => {
                warn_ignored_office_text(
                    text.as_ref(),
                    in_paragraph,
                    &mut paragraph_saw_ignored_text,
                );
            }
            Ok(Event::Eof) => break,
            Ok(_) => {}
            Err(error) => return Err(document_error(format!("parse xml: {error}"))),
        }
    }
    Ok(paragraphs)
}
fn spreadsheet_row_text(row: &[Data], max_columns: usize) -> Option<Vec<String>> {
let row = row
.iter()
.take(max_columns)
.map(cell_text)
.collect::<Vec<_>>();
row.iter().any(|cell| !cell.is_empty()).then_some(row)
}
/// Records and logs text that appeared inside a paragraph but outside a text
/// element.
///
/// Does nothing outside a paragraph or when `text` is empty after `single_line`
/// normalization; otherwise sets `saw_ignored_text` and emits a warning.
fn warn_ignored_office_text(text: &[u8], in_paragraph: bool, saw_ignored_text: &mut bool) {
    // Short-circuit keeps the normalization from running outside paragraphs.
    if in_paragraph && !single_line(&String::from_utf8_lossy(text)).is_empty() {
        *saw_ignored_text = true;
        log::warn!("office XML paragraph text outside <t> element was ignored");
    }
}
/// Logs why a paragraph produced no text, picking the most specific reason.
///
/// Precedence: ignored text outside text elements, then an empty text element,
/// then unrecognized child elements, then a plain empty paragraph.
fn warn_empty_office_paragraph(
    saw_text_element: bool,
    saw_ignored_text: bool,
    saw_unknown_xml: bool,
) {
    match (saw_ignored_text, saw_text_element, saw_unknown_xml) {
        (true, _, _) => {
            log::warn!("office XML paragraph contained text outside recognized <t> elements");
        }
        (false, true, _) => {
            log::warn!("office XML paragraph was empty after text extraction");
        }
        (false, false, true) => {
            log::warn!("office XML paragraph contained no recognized text runs");
        }
        (false, false, false) => {
            log::warn!("office XML paragraph was empty");
        }
    }
}
/// Renders `rows` as a markdown table, using the first row as the header.
///
/// Every row is padded to the width of the widest row. Returns an empty string
/// when `rows` is empty; otherwise the result has no trailing whitespace.
pub(crate) fn markdown_table(rows: &[Vec<String>]) -> String {
    let Some((header, body)) = rows.split_first() else {
        return String::new();
    };
    let column_count = rows.iter().map(Vec::len).max().unwrap_or(1);
    let separators = vec!["---".to_string(); column_count];

    let mut table = String::new();
    push_table_row(&mut table, header, column_count);
    push_table_row(&mut table, &separators, column_count);
    for row in body {
        push_table_row(&mut table, row, column_count);
    }
    table.trim_end().to_string()
}
/// Appends one markdown table line with exactly `column_count` cells to `markdown`.
///
/// Cells beyond the end of `row` are rendered empty; cells beyond `column_count`
/// are not rendered. Cell text is escaped with `escape_table_cell`.
fn push_table_row(markdown: &mut String, row: &[String], column_count: usize) {
    markdown.push('|');
    // Pad with `None` so short rows still yield `column_count` cells.
    let padded = row
        .iter()
        .map(Some)
        .chain(std::iter::repeat(None))
        .take(column_count);
    for cell in padded {
        markdown.push(' ');
        if let Some(text) = cell {
            markdown.push_str(&escape_table_cell(text));
        }
        markdown.push_str(" |");
    }
    markdown.push('\n');
}
/// Returns the display text of a spreadsheet cell, normalized with `single_line`.
///
/// An empty cell yields an empty string without going through formatting.
fn cell_text(cell: &Data) -> String {
    if matches!(cell, Data::Empty) {
        String::new()
    } else {
        single_line(&cell.to_string())
    }
}
/// Prepares `cell` for use inside a markdown table: normalizes it with
/// `single_line` and backslash-escapes every `|`.
fn escape_table_cell(cell: &str) -> String {
    let flattened = single_line(cell);
    flattened.replace('|', "\\|")
}
/// Returns the part of a qualified XML name after its first `:`.
///
/// A name without a colon is returned unchanged.
fn local_name(name: &[u8]) -> &[u8] {
    match name.iter().position(|&byte| byte == b':') {
        Some(colon) => &name[colon + 1..],
        None => name,
    }
}
/// Parses the slide number out of an archive entry named
/// `ppt/slides/slide<N>.xml`.
///
/// Returns `None` when the name does not have that prefix and suffix or when the
/// part in between does not parse as a `usize`.
fn slide_number(name: &str) -> Option<usize> {
    let after_prefix = name.strip_prefix("ppt/slides/slide")?;
    let digits = after_prefix.strip_suffix(".xml")?;
    digits.parse::<usize>().ok()
}
// Unit tests for the private row and paragraph helpers of this module.
#[cfg(test)]
mod tests {
use super::*;
// A row whose cells are all empty after text conversion (including a
// whitespace-only string) is dropped; a row with any text keeps its cells,
// with empty cells preserved as empty strings.
#[test]
fn spreadsheet_row_text_filters_fully_empty_rows() {
let empty = [Data::Empty, Data::String(" ".to_string())];
let row = [Data::Empty, Data::String("value".to_string())];
assert_eq!(spreadsheet_row_text(&empty, 8), None);
assert_eq!(
spreadsheet_row_text(&row, 8),
Some(vec![String::new(), "value".to_string()])
);
}
// Text placed directly inside a paragraph, outside any text element, does not
// end up in the extracted paragraph.
#[test]
fn xml_paragraphs_ignore_text_outside_t_without_api_change() {
let paragraphs = extract_xml_paragraphs("<w:p>ignored<w:r><w:t>kept</w:t></w:r></w:p>")
.expect("parse xml");
assert_eq!(paragraphs, vec!["kept".to_string()]);
}
}