use crate::container::decode_xml_bytes;
use crate::error::{Error, Result};
use std::fs::File;
use std::io::{BufReader, Read, Seek};
use std::path::Path;
/// Four-byte signature (`PK\x03\x04`) that input must start with to be treated as a ZIP archive.
const ZIP_MAGIC: [u8; 4] = *b"PK\x03\x04";

/// Content type looked for in `[Content_Types].xml` to recognise a Word document.
const DOCX_CONTENT_TYPE: &str =
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml";

/// Content type looked for in `[Content_Types].xml` to recognise an Excel workbook.
const XLSX_CONTENT_TYPE: &str =
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml";

/// Content type looked for in `[Content_Types].xml` to recognise a PowerPoint presentation.
const PPTX_CONTENT_TYPE: &str =
    "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml";
/// The Office document package kinds this module can recognise.
///
/// The type is a plain field-less enum, so it is cheap to copy, compare and
/// hash (usable as a `HashMap`/`HashSet` key).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FormatType {
    /// Word document (`docx`).
    Docx,
    /// Excel workbook (`xlsx`).
    Xlsx,
    /// PowerPoint presentation (`pptx`).
    Pptx,
}
impl FormatType {
pub fn extension(&self) -> &'static str {
match self {
FormatType::Docx => "docx",
FormatType::Xlsx => "xlsx",
FormatType::Pptx => "pptx",
}
}
pub fn name(&self) -> &'static str {
match self {
FormatType::Docx => "Word Document",
FormatType::Xlsx => "Excel Workbook",
FormatType::Pptx => "PowerPoint Presentation",
}
}
}
/// Formats a [`FormatType`] as its human-readable name (see `FormatType::name`).
impl std::fmt::Display for FormatType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `pad` applies the caller's width/fill/alignment/precision flags, so
        // `format!("{:<20}", ty)` lines up in tables. Plain `{}` output is
        // unchanged.
        f.pad(self.name())
    }
}
/// Detects the document format of the file at `path`.
///
/// The file is opened for reading, wrapped in a [`BufReader`] and handed to
/// [`detect_format_from_reader`].
///
/// # Errors
///
/// Returns an error if the file cannot be opened, or any error produced by
/// [`detect_format_from_reader`].
pub fn detect_format_from_path(path: impl AsRef<Path>) -> Result<FormatType> {
    let buffered = BufReader::new(File::open(path.as_ref())?);
    detect_format_from_reader(buffered)
}
pub fn detect_format_from_bytes(data: &[u8]) -> Result<FormatType> {
if data.len() < 4 || data[..4] != ZIP_MAGIC {
return Err(Error::UnknownFormat);
}
let cursor = std::io::Cursor::new(data);
detect_format_from_reader(cursor)
}
pub fn detect_format_from_reader<R: Read + Seek>(reader: R) -> Result<FormatType> {
let mut archive = zip::ZipArchive::new(reader)?;
let content_types = match archive.by_name("[Content_Types].xml") {
Ok(mut file) => {
let mut bytes = Vec::new();
file.read_to_end(&mut bytes)?;
decode_xml_bytes(&bytes)?
}
Err(_) => {
return Err(Error::MissingComponent("[Content_Types].xml".to_string()));
}
};
if content_types.contains(DOCX_CONTENT_TYPE) {
Ok(FormatType::Docx)
} else if content_types.contains(XLSX_CONTENT_TYPE) {
Ok(FormatType::Xlsx)
} else if content_types.contains(PPTX_CONTENT_TYPE) {
Ok(FormatType::Pptx)
} else {
detect_by_folder_structure(&mut archive)
}
}
/// Fallback heuristic: infers the format from the archive's entry names.
///
/// Checks whether any entry name starts with `word/`, `xl/` or `ppt/`. The
/// format is reported only when exactly one of those prefixes is present.
///
/// # Errors
///
/// Returns `Error::UnknownFormat` when none of the prefixes is present or
/// when more than one is.
fn detect_by_folder_structure<R: Read + Seek>(
    archive: &mut zip::ZipArchive<R>,
) -> Result<FormatType> {
    let mut has_word = false;
    let mut has_xl = false;
    let mut has_ppt = false;

    // One pass over the borrowed names; no need to copy them into owned strings.
    for name in archive.file_names() {
        has_word |= name.starts_with("word/");
        has_xl |= name.starts_with("xl/");
        has_ppt |= name.starts_with("ppt/");
    }

    match (has_word, has_xl, has_ppt) {
        (true, false, false) => Ok(FormatType::Docx),
        (false, true, false) => Ok(FormatType::Xlsx),
        (false, false, true) => Ok(FormatType::Pptx),
        // Zero matches or an ambiguous mix of folders.
        _ => Err(Error::UnknownFormat),
    }
}
/// Reports whether `data` begins with the four-byte ZIP signature.
///
/// Returns `false` for input shorter than four bytes. Only the leading
/// signature is inspected; the rest of the data is not validated.
pub fn is_zip_file(data: &[u8]) -> bool {
    data.starts_with(&ZIP_MAGIC)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs path-based detection on an optional fixture file.
    ///
    /// The check is skipped (and the calling test passes) when the fixture is
    /// not present on disk.
    fn check_fixture(path: &str, expected: FormatType) {
        if !std::path::Path::new(path).exists() {
            return;
        }
        let result = detect_format_from_path(path);
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), expected);
    }

    #[test]
    fn test_format_type_display() {
        let cases = [
            (FormatType::Docx, "Word Document"),
            (FormatType::Xlsx, "Excel Workbook"),
            (FormatType::Pptx, "PowerPoint Presentation"),
        ];
        for (format, shown) in cases.iter() {
            assert_eq!(format.to_string(), *shown);
        }
    }

    #[test]
    fn test_format_type_extension() {
        let cases = [
            (FormatType::Docx, "docx"),
            (FormatType::Xlsx, "xlsx"),
            (FormatType::Pptx, "pptx"),
        ];
        for (format, ext) in cases.iter() {
            assert_eq!(format.extension(), *ext);
        }
    }

    #[test]
    fn test_is_zip_file() {
        // Signature followed by extra data.
        assert!(is_zip_file(&[0x50, 0x4B, 0x03, 0x04, 0x00]));
        // Four bytes, wrong signature.
        assert!(!is_zip_file(&[0x00, 0x00, 0x00, 0x00]));
        // Too short to hold the signature.
        assert!(!is_zip_file(&[0x50, 0x4B]));
    }

    #[test]
    fn test_detect_invalid_data() {
        let outcome = detect_format_from_bytes(&[0x00, 0x00, 0x00, 0x00]);
        assert!(matches!(outcome, Err(Error::UnknownFormat)));
    }

    #[test]
    fn test_detect_docx_from_file() {
        check_fixture("test-files/file-sample_1MB.docx", FormatType::Docx);
    }

    #[test]
    fn test_detect_xlsx_from_file() {
        check_fixture("test-files/file_example_XLSX_5000.xlsx", FormatType::Xlsx);
    }

    #[test]
    fn test_detect_pptx_from_file() {
        check_fixture("test-files/file_example_PPT_1MB.pptx", FormatType::Pptx);
    }
}