use crate::common::processing::{test_office_convert_server_container, test_processing_layer};
use bytes::Bytes;
use docbox_database::models::generated_file::GeneratedFileType;
use docbox_processing::{ProcessingError, ProcessingLayerConfig, ProcessingOutput, process_file};
use std::path::Path;
mod common;
/// Processes `sample.pdf` and checks the generated uploads, the extracted
/// text content, and the per-page index metadata.
///
/// Unlike the office formats, a PDF input is expected to yield only four
/// uploads (three JPEG images and one text file).
#[tokio::test]
async fn test_process_pdf() {
    let output = process_sample_file("sample.pdf")
        .await
        .expect("pdf should produce processing output");

    assert!(
        !output.encrypted,
        "File was marked as encrypted but should not be"
    );

    let uploads = &output.upload_queue;
    assert_eq!(
        uploads.len(),
        4,
        "PDF file should produce 3 images and 1 text file"
    );

    // The length assertion above guarantees the indexes below are in bounds.
    let cover_page = &uploads[0];
    assert_eq!(cover_page.mime, mime::IMAGE_JPEG);
    assert!(matches!(cover_page.ty, GeneratedFileType::CoverPage));

    let large_thumbnail = &uploads[1];
    assert_eq!(large_thumbnail.mime, mime::IMAGE_JPEG);
    assert!(matches!(
        large_thumbnail.ty,
        GeneratedFileType::LargeThumbnail
    ));

    let small_thumbnail = &uploads[2];
    assert_eq!(small_thumbnail.mime, mime::IMAGE_JPEG);
    assert!(matches!(
        small_thumbnail.ty,
        GeneratedFileType::SmallThumbnail
    ));

    let text_upload = &uploads[3];
    assert_eq!(text_upload.mime, mime::TEXT_PLAIN);
    assert!(matches!(text_upload.ty, GeneratedFileType::TextContent));

    // Normalise line endings so the comparison does not depend on them.
    let extracted_text =
        String::from_utf8_lossy(text_upload.bytes.as_ref()).replace("\r\n", "\n");
    assert_eq!(
        extracted_text,
        "Sample document\nThis is a second line\n\n\u{c}This is the second page\n\n\u{c}"
    );

    let metadata = output
        .index_metadata
        .as_ref()
        .expect("pdf file should produce index metadata");
    let pages = metadata
        .pages
        .as_ref()
        .expect("pdf file should produce pages");
    assert_eq!(pages.len(), 3);

    let page_one = &pages[0];
    assert_eq!(page_one.page, 0);
    assert_eq!(
        page_one.content.replace("\r\n", "\n"),
        "Sample document\nThis is a second line\n\n"
    );

    let page_two = &pages[1];
    assert_eq!(page_two.page, 1);
    assert_eq!(
        page_two.content.replace("\r\n", "\n"),
        "This is the second page\n\n"
    );

    // The text ends with a form feed, so a trailing empty page is expected.
    let page_three = &pages[2];
    assert_eq!(page_three.page, 2);
    assert_eq!(page_three.content, "");

    assert!(
        output.additional_files.is_empty(),
        "PDF file should not produce additional files"
    );
}
/// Runs the shared document checks (`test_process_document`) against the
/// `sample.docx` fixture.
#[tokio::test]
async fn test_process_docx() {
test_process_document("sample.docx").await;
}
/// Runs the shared document checks (`test_process_document`) against the
/// `sample.rtf` fixture.
#[tokio::test]
async fn test_process_rtf() {
test_process_document("sample.rtf").await;
}
/// Runs the shared document checks (`test_process_document`) against the
/// `sample.odt` fixture.
#[tokio::test]
async fn test_process_odt() {
test_process_document("sample.odt").await;
}
/// Runs the shared document checks (`test_process_document`) against the
/// `sample.dotx` fixture.
#[tokio::test]
async fn test_process_dotx() {
test_process_document("sample.dotx").await;
}
/// Runs the shared document checks (`test_process_document`) against the
/// `sample.dot` fixture.
#[tokio::test]
async fn test_process_dot() {
test_process_document("sample.dot").await;
}
/// Runs the shared document checks (`test_process_document`) against the
/// `sample.doc` fixture.
#[tokio::test]
async fn test_process_doc() {
test_process_document("sample.doc").await;
}
/// Runs the shared document checks (`test_process_document`) against the
/// `sample.dotm` fixture.
#[tokio::test]
async fn test_process_dotm() {
test_process_document("sample.dotm").await;
}
/// Runs the shared encrypted-file checks (`test_process_encrypted`) against
/// the `sample_encrypted.pdf` fixture.
#[tokio::test]
async fn test_process_pdf_encrypted() {
test_process_encrypted("sample_encrypted.pdf").await;
}
/// Runs the shared encrypted-file checks (`test_process_encrypted`) against
/// the `sample_encrypted.docx` fixture.
#[tokio::test]
async fn test_process_docx_encrypted() {
test_process_encrypted("sample_encrypted.docx").await;
}
/// Runs the shared encrypted-file checks (`test_process_encrypted`) against
/// the `sample_encrypted.doc` fixture.
#[tokio::test]
async fn test_process_doc_encrypted() {
test_process_encrypted("sample_encrypted.doc").await;
}
#[tokio::test]
async fn test_process_docx_corrupted() {
let container = test_office_convert_server_container().await;
let processing_layer =
test_processing_layer(&container, ProcessingLayerConfig::default()).await;
let samples_path = Path::new("tests/samples/documents");
let sample_file = samples_path.join("sample_corrupted.docx");
let bytes = tokio::fs::read(&sample_file).await.unwrap();
let bytes = Bytes::from(bytes);
let mime = mime_guess::from_path(&sample_file).iter().next().unwrap();
let output = process_file(&None, &processing_layer, bytes, &mime)
.await
.unwrap_err();
assert!(
matches!(output, ProcessingError::MalformedFile(_),),
"corrupted file should produce a malformed document error got {output:?}"
);
}
/// Runs the shared workbook checks (`test_process_workbook`) against the
/// `sample.xlsx` fixture.
#[tokio::test]
async fn test_process_xlsx() {
test_process_workbook("sample.xlsx").await;
}
/// Runs the shared workbook checks (`test_process_workbook`) against the
/// `sample.xlsb` fixture.
#[tokio::test]
async fn test_process_xlsb() {
test_process_workbook("sample.xlsb").await;
}
/// Runs the shared workbook checks (`test_process_workbook`) against the
/// `sample.xls` fixture.
#[tokio::test]
async fn test_process_xls() {
test_process_workbook("sample.xls").await;
}
/// Runs the shared workbook checks (`test_process_workbook`) against the
/// `sample.xlsm` fixture.
#[tokio::test]
async fn test_process_xlsm() {
test_process_workbook("sample.xlsm").await;
}
/// Runs the shared workbook checks (`test_process_workbook`) against the
/// `sample.xlt` fixture.
#[tokio::test]
async fn test_process_xlt() {
test_process_workbook("sample.xlt").await;
}
/// Runs the shared workbook checks (`test_process_workbook`) against the
/// `sample.xltm` fixture.
#[tokio::test]
async fn test_process_xltm() {
test_process_workbook("sample.xltm").await;
}
/// Runs the shared workbook checks (`test_process_workbook`) against the
/// `sample.xltx` fixture.
#[tokio::test]
async fn test_process_xltx() {
test_process_workbook("sample.xltx").await;
}
/// Processes `sample.ods` and checks the generated uploads, the extracted
/// text content, and the per-page index metadata.
///
/// This format has its own test because the expected text differs from the
/// string asserted by `validate_workbook_output`.
#[tokio::test]
async fn test_process_ods() {
    let output = process_sample_file("sample.ods")
        .await
        .expect("office file should produce output");

    assert!(
        !output.encrypted,
        "file was marked as encrypted but should not be"
    );

    let uploads = &output.upload_queue;
    assert_eq!(
        uploads.len(),
        5,
        "office file should produce 1 pdf, 3 images and 1 text file"
    );

    // The length assertion above guarantees the indexes below are in bounds.
    let cover_page = &uploads[0];
    assert_eq!(cover_page.mime, mime::IMAGE_JPEG);
    assert!(matches!(cover_page.ty, GeneratedFileType::CoverPage));

    let large_thumbnail = &uploads[1];
    assert_eq!(large_thumbnail.mime, mime::IMAGE_JPEG);
    assert!(matches!(
        large_thumbnail.ty,
        GeneratedFileType::LargeThumbnail
    ));

    let small_thumbnail = &uploads[2];
    assert_eq!(small_thumbnail.mime, mime::IMAGE_JPEG);
    assert!(matches!(
        small_thumbnail.ty,
        GeneratedFileType::SmallThumbnail
    ));

    let text_upload = &uploads[3];
    assert_eq!(text_upload.mime, mime::TEXT_PLAIN);
    assert!(matches!(text_upload.ty, GeneratedFileType::TextContent));

    let pdf_upload = &uploads[4];
    assert_eq!(pdf_upload.mime, mime::APPLICATION_PDF);
    assert!(matches!(pdf_upload.ty, GeneratedFileType::Pdf));

    // Normalise line endings so the comparison does not depend on them.
    let extracted_text =
        String::from_utf8_lossy(text_upload.bytes.as_ref()).replace("\r\n", "\n");
    assert_eq!(extracted_text, "Sample Sample 1Sample 2\n\n\u{c}");

    let metadata = output
        .index_metadata
        .as_ref()
        .expect("office file should produce index metadata");
    let pages = metadata
        .pages
        .as_ref()
        .expect("office file should produce pages");
    assert_eq!(pages.len(), 2);

    let page_one = &pages[0];
    assert_eq!(page_one.page, 0);
    assert_eq!(
        page_one.content.replace("\r\n", "\n"),
        "Sample Sample 1Sample 2\n\n"
    );

    // The text ends with a form feed, so a trailing empty page is expected.
    let page_two = &pages[1];
    assert_eq!(page_two.page, 1);
    assert_eq!(page_two.content, "");

    assert!(
        output.additional_files.is_empty(),
        "office file should not produce additional files"
    );
}
/// Runs the shared encrypted-file checks (`test_process_encrypted`) against
/// the `sample_encrypted.xlsx` fixture.
#[tokio::test]
async fn test_process_xlsx_encrypted() {
test_process_encrypted("sample_encrypted.xlsx").await;
}
/// Runs the shared encrypted-file checks (`test_process_encrypted`) against
/// the `sample_encrypted.xls` fixture.
#[tokio::test]
async fn test_process_xls_encrypted() {
test_process_encrypted("sample_encrypted.xls").await;
}
/// Runs [`process_file`] over one of the sample documents.
///
/// Calls `test_office_convert_server_container` and `test_processing_layer`
/// (with the default [`ProcessingLayerConfig`]) to build the processing
/// layer, reads `sample_file` from `tests/samples/documents`, guesses its
/// mime type from the file name, and passes the bytes to [`process_file`].
///
/// Returns whatever [`process_file`] yields on success.
///
/// # Panics
///
/// Panics with a message naming the sample path when the file cannot be
/// read, when no mime type can be guessed for it, or when processing returns
/// an error. Naming the path matters because nearly every test in this file
/// goes through this helper.
async fn process_sample_file(sample_file: &str) -> Option<ProcessingOutput> {
    let container = test_office_convert_server_container().await;
    let processing_layer =
        test_processing_layer(&container, ProcessingLayerConfig::default()).await;

    let sample_path = Path::new("tests/samples/documents").join(sample_file);

    let contents = tokio::fs::read(&sample_path)
        .await
        .unwrap_or_else(|error| {
            panic!(
                "failed to read sample file {}: {error}",
                sample_path.display()
            )
        });
    let bytes = Bytes::from(contents);

    let mime = mime_guess::from_path(&sample_path)
        .iter()
        .next()
        .unwrap_or_else(|| {
            panic!(
                "could not guess a mime type for sample file {}",
                sample_path.display()
            )
        });

    process_file(&None, &processing_layer, bytes, &mime)
        .await
        .unwrap_or_else(|error| {
            panic!(
                "failed to process sample file {}: {error:?}",
                sample_path.display()
            )
        })
}
/// Processes an encrypted sample and checks that it is flagged as encrypted
/// and that no uploads, index metadata, or additional files were produced.
async fn test_process_encrypted(sample_file: &str) {
    let processed = process_sample_file(sample_file).await;
    let output = processed.expect("office file should produce output");

    let is_encrypted = output.encrypted;
    assert!(
        is_encrypted,
        "File was not marked as encrypted but should be"
    );

    let has_no_uploads = output.upload_queue.is_empty();
    assert!(has_no_uploads, "Encrypted file should not produce uploads");

    let has_no_metadata = output.index_metadata.is_none();
    assert!(
        has_no_metadata,
        "Encrypted file should not produce index metadata"
    );

    let has_no_additional_files = output.additional_files.is_empty();
    assert!(
        has_no_additional_files,
        "Encrypted file should not produce additional files"
    );
}
/// Processes the given workbook sample and runs `validate_workbook_output`
/// on the result.
async fn test_process_workbook(sample_file: &str) {
    let processed = process_sample_file(sample_file).await;
    let output = processed.expect("office file should produce output");
    validate_workbook_output(&output);
}
/// Processes the given document sample and runs `validate_document_output`
/// on the result.
async fn test_process_document(sample_file: &str) {
    let processed = process_sample_file(sample_file).await;
    let output = processed.expect("office file should produce output");
    validate_document_output(&output);
}
/// Asserts the processing output expected for the word-processor samples.
///
/// Checks that the output is not encrypted, that five uploads were queued in
/// the order cover page, large thumbnail, small thumbnail, text content and
/// PDF, that the extracted text matches the sample, that three pages of index
/// metadata exist (the last one empty), and that there are no additional
/// files.
fn validate_document_output(output: &ProcessingOutput) {
    assert!(
        !output.encrypted,
        "file was marked as encrypted but should not be"
    );

    let uploads = &output.upload_queue;
    assert_eq!(
        uploads.len(),
        5,
        "office file should produce 1 pdf, 3 images and 1 text file"
    );

    // The length assertion above guarantees the indexes below are in bounds.
    let cover_page = &uploads[0];
    assert_eq!(cover_page.mime, mime::IMAGE_JPEG);
    assert!(matches!(cover_page.ty, GeneratedFileType::CoverPage));

    let large_thumbnail = &uploads[1];
    assert_eq!(large_thumbnail.mime, mime::IMAGE_JPEG);
    assert!(matches!(
        large_thumbnail.ty,
        GeneratedFileType::LargeThumbnail
    ));

    let small_thumbnail = &uploads[2];
    assert_eq!(small_thumbnail.mime, mime::IMAGE_JPEG);
    assert!(matches!(
        small_thumbnail.ty,
        GeneratedFileType::SmallThumbnail
    ));

    let text_upload = &uploads[3];
    assert_eq!(text_upload.mime, mime::TEXT_PLAIN);
    assert!(matches!(text_upload.ty, GeneratedFileType::TextContent));

    let pdf_upload = &uploads[4];
    assert_eq!(pdf_upload.mime, mime::APPLICATION_PDF);
    assert!(matches!(pdf_upload.ty, GeneratedFileType::Pdf));

    // Normalise line endings so the comparison does not depend on them.
    let extracted_text =
        String::from_utf8_lossy(text_upload.bytes.as_ref()).replace("\r\n", "\n");
    assert_eq!(
        extracted_text,
        "Sample document\nThis is a second line\n\n\u{c}This is the second page\n\n\u{c}"
    );

    let metadata = output
        .index_metadata
        .as_ref()
        .expect("office file should produce index metadata");
    let pages = metadata
        .pages
        .as_ref()
        .expect("office file should produce pages");
    assert_eq!(pages.len(), 3);

    let page_one = &pages[0];
    assert_eq!(page_one.page, 0);
    assert_eq!(
        page_one.content.replace("\r\n", "\n"),
        "Sample document\nThis is a second line\n\n"
    );

    let page_two = &pages[1];
    assert_eq!(page_two.page, 1);
    assert_eq!(
        page_two.content.replace("\r\n", "\n"),
        "This is the second page\n\n"
    );

    // The text ends with a form feed, so a trailing empty page is expected.
    let page_three = &pages[2];
    assert_eq!(page_three.page, 2);
    assert_eq!(page_three.content, "");

    assert!(
        output.additional_files.is_empty(),
        "office file should not produce additional files"
    );
}
/// Asserts the processing output expected for the spreadsheet samples.
///
/// Checks that the output is not encrypted, that five uploads were queued in
/// the order cover page, large thumbnail, small thumbnail, text content and
/// PDF, that the extracted text matches the sample, that two pages of index
/// metadata exist (the last one empty), and that there are no additional
/// files.
fn validate_workbook_output(output: &ProcessingOutput) {
    assert!(
        !output.encrypted,
        "file was marked as encrypted but should not be"
    );

    let uploads = &output.upload_queue;
    assert_eq!(
        uploads.len(),
        5,
        "office file should produce 1 pdf, 3 images and 1 text file"
    );

    // The length assertion above guarantees the indexes below are in bounds.
    let cover_page = &uploads[0];
    assert_eq!(cover_page.mime, mime::IMAGE_JPEG);
    assert!(matches!(cover_page.ty, GeneratedFileType::CoverPage));

    let large_thumbnail = &uploads[1];
    assert_eq!(large_thumbnail.mime, mime::IMAGE_JPEG);
    assert!(matches!(
        large_thumbnail.ty,
        GeneratedFileType::LargeThumbnail
    ));

    let small_thumbnail = &uploads[2];
    assert_eq!(small_thumbnail.mime, mime::IMAGE_JPEG);
    assert!(matches!(
        small_thumbnail.ty,
        GeneratedFileType::SmallThumbnail
    ));

    let text_upload = &uploads[3];
    assert_eq!(text_upload.mime, mime::TEXT_PLAIN);
    assert!(matches!(text_upload.ty, GeneratedFileType::TextContent));

    let pdf_upload = &uploads[4];
    assert_eq!(pdf_upload.mime, mime::APPLICATION_PDF);
    assert!(matches!(pdf_upload.ty, GeneratedFileType::Pdf));

    // Normalise line endings so the comparison does not depend on them.
    let extracted_text =
        String::from_utf8_lossy(text_upload.bytes.as_ref()).replace("\r\n", "\n");
    assert_eq!(extracted_text, "Sample\n\nSample 1 Sample 2\n\n\u{c}");

    let metadata = output
        .index_metadata
        .as_ref()
        .expect("office file should produce index metadata");
    let pages = metadata
        .pages
        .as_ref()
        .expect("office file should produce pages");
    assert_eq!(pages.len(), 2);

    let page_one = &pages[0];
    assert_eq!(page_one.page, 0);
    assert_eq!(
        page_one.content.replace("\r\n", "\n"),
        "Sample\n\nSample 1 Sample 2\n\n"
    );

    // The text ends with a form feed, so a trailing empty page is expected.
    let page_two = &pages[1];
    assert_eq!(page_two.page, 1);
    assert_eq!(page_two.content, "");

    assert!(
        output.additional_files.is_empty(),
        "office file should not produce additional files"
    );
}