mod docx;
mod pptx;
mod xlsx;
/// The document families this module can recognise inside a ZIP container.
///
/// `detect_document_type` picks a variant from the prefix of an archive
/// entry name: `xl/`, `word/` or `ppt/`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum DocumentType {
    /// Chosen when an archive entry name starts with `xl/`.
    Xlsx,
    /// Chosen when an archive entry name starts with `word/`.
    Docx,
    /// Chosen when an archive entry name starts with `ppt/`.
    Pptx,
}
/// Crate-internal error type that unifies the failure types of the libraries
/// used for document conversion.
///
/// Each variant simply wraps the underlying library error unchanged; the
/// `From` impls in this module allow those errors to be converted with `?`.
#[derive(Debug)]
pub(crate) enum DocumentError {
    /// An error reported by the `zip` crate.
    Zip(zip::result::ZipError),
    /// A general error reported by `quick_xml`.
    Xml(quick_xml::Error),
    /// An attribute-level error reported by `quick_xml`.
    XmlAttr(quick_xml::events::attributes::AttrError),
    /// A generic `calamine` error.
    Calamine(calamine::Error),
    /// A `calamine` error specific to XLSX handling.
    CalamineXlsx(calamine::XlsxError),
    /// A standard-library I/O error.
    Io(std::io::Error),
}
impl From<zip::result::ZipError> for DocumentError {
    /// Wraps a `zip` error in [`DocumentError::Zip`].
    fn from(err: zip::result::ZipError) -> Self {
        Self::Zip(err)
    }
}
impl From<quick_xml::Error> for DocumentError {
fn from(e: quick_xml::Error) -> Self {
DocumentError::Xml(e)
}
}
impl From<quick_xml::events::attributes::AttrError> for DocumentError {
    /// Wraps a `quick_xml` attribute error in [`DocumentError::XmlAttr`].
    fn from(err: quick_xml::events::attributes::AttrError) -> Self {
        Self::XmlAttr(err)
    }
}
impl From<calamine::Error> for DocumentError {
fn from(e: calamine::Error) -> Self {
DocumentError::Calamine(e)
}
}
impl From<calamine::XlsxError> for DocumentError {
    /// Wraps a `calamine` XLSX error in [`DocumentError::CalamineXlsx`].
    fn from(err: calamine::XlsxError) -> Self {
        Self::CalamineXlsx(err)
    }
}
impl From<std::io::Error> for DocumentError {
fn from(e: std::io::Error) -> Self {
DocumentError::Io(e)
}
}
/// Leading four bytes (`PK\x03\x04`) a buffer must have before it is treated
/// as a ZIP archive by `detect_document_type`.
const ZIP_MAGIC: [u8; 4] = *b"PK\x03\x04";
/// Determines which supported document family `bytes` belongs to.
///
/// Returns `None` when:
/// - `bytes` does not begin with [`ZIP_MAGIC`] (including buffers shorter
///   than four bytes),
/// - the buffer cannot be opened as a ZIP archive, or
/// - no archive entry name starts with `xl/`, `word/` or `ppt/`.
///
/// Entries are visited in index order and the first entry whose name carries
/// one of the known prefixes decides the result; entries without a retrievable
/// name are skipped.
fn detect_document_type(bytes: &[u8]) -> Option<DocumentType> {
    // Cheap rejection before attempting to open the archive at all.
    if !bytes.starts_with(&ZIP_MAGIC) {
        return None;
    }

    let archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)).ok()?;

    (0..archive.len())
        .filter_map(|index| archive.name_for_index(index))
        .find_map(|entry| {
            if entry.starts_with("xl/") {
                Some(DocumentType::Xlsx)
            } else if entry.starts_with("word/") {
                Some(DocumentType::Docx)
            } else if entry.starts_with("ppt/") {
                Some(DocumentType::Pptx)
            } else {
                None
            }
        })
}
/// Attempts to convert a document held in `bytes` to Markdown.
///
/// The document family is detected with `detect_document_type` and the buffer
/// is then handed to the matching `to_markdown` function of the `xlsx`, `docx`
/// or `pptx` submodule.
///
/// Returns `None` both when no supported document type is detected and when
/// the selected converter returns an error; the error value itself is
/// discarded.
pub(crate) fn try_convert_document(bytes: &[u8]) -> Option<String> {
    detect_document_type(bytes).and_then(|kind| {
        let rendered = match kind {
            DocumentType::Xlsx => xlsx::to_markdown(bytes),
            DocumentType::Docx => docx::to_markdown(bytes),
            DocumentType::Pptx => pptx::to_markdown(bytes),
        };
        // Best-effort API: conversion failures are reported as `None`.
        rendered.ok()
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty buffer is too short to carry the ZIP magic.
    #[test]
    fn test_detect_empty_bytes() {
        assert_eq!(detect_document_type(&[]), None);
    }

    /// Plain text that does not start with the ZIP magic is rejected.
    #[test]
    fn test_detect_non_zip() {
        let text: &[u8] = b"hello world this is not a zip";
        assert_eq!(detect_document_type(text), None);
    }

    /// Only the first two magic bytes are present, so the prefix check fails.
    #[test]
    fn test_detect_truncated_zip_magic() {
        let partial_magic: [u8; 2] = [0x50, 0x4B];
        assert_eq!(detect_document_type(&partial_magic), None);
    }

    /// Arbitrary non-document inputs must yield `None` rather than panic.
    #[test]
    fn test_try_convert_garbage_no_panic() {
        let garbage: [&[u8]; 3] = [&[], &[0xFF; 100], b"not a document at all"];
        for input in garbage.iter().copied() {
            assert!(try_convert_document(input).is_none());
        }
    }

    /// A buffer with the ZIP magic followed by two zero bytes must yield
    /// `None` rather than panic.
    #[test]
    fn test_try_convert_truncated_zip_no_panic() {
        let truncated: [u8; 6] = [0x50, 0x4B, 0x03, 0x04, 0x00, 0x00];
        assert!(try_convert_document(&truncated).is_none());
    }
}