#![allow(clippy::unwrap_used, clippy::expect_used, clippy::use_debug)]
use file_parser::domain::parser::FileParserBackend;
use file_parser::infra::parsers::docx_parser::DocxParser;
use std::path::PathBuf;
/// Builds the path to the DOCX fixture named `filename`.
///
/// Fixtures are looked up under `testing/e2e/testdata/docx`, resolved against
/// the directory two levels above this crate's `CARGO_MANIFEST_DIR`.
///
/// # Panics
///
/// Panics if the manifest directory has no grandparent directory.
fn get_test_file_path(filename: &str) -> PathBuf {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
    let base_dir = manifest_dir.ancestors().nth(2).unwrap();
    base_dir.join("testing/e2e/testdata/docx").join(filename)
}
/// Checks the identifier and the supported extension list reported by `DocxParser`.
#[tokio::test]
async fn test_docx_parser_basic_info() {
    let docx_parser = DocxParser::new();

    let reported_id = docx_parser.id();
    assert_eq!(reported_id, "docx");

    let reported_extensions = docx_parser.supported_extensions();
    assert_eq!(reported_extensions, &["docx"]);
}
/// Parses the single-table multilingual fixture from disk and checks that the
/// result contains blocks and records the fixture's filename.
///
/// Returns early (after logging to stderr) when the fixture file is absent.
#[tokio::test]
async fn test_docx_parser_with_working_file() {
    let parser = DocxParser::new();
    let test_file = get_test_file_path("test_file_1table_multilingual.docx");
    if !test_file.exists() {
        eprintln!("Skipping test: test file not found at {test_file:?}");
        return;
    }

    // `expect` reports the parse error directly, so no separate `is_ok()`
    // assertion is needed before unwrapping.
    let document = parser
        .parse_local_path(&test_file)
        .await
        .expect("Failed to parse working DOCX file");

    assert!(!document.blocks.is_empty(), "Document should have blocks");
    // Comparing against `Some(..)` also covers the "filename is present" check.
    assert_eq!(
        document.meta.original_filename.as_deref(),
        Some("test_file_1table_multilingual.docx")
    );
}
/// Parses the two-page multilingual fixture from disk and checks that the
/// result contains blocks.
///
/// Returns early when the fixture file is absent, and also when parsing fails
/// with an error whose text contains `malformed XML` or `rotate_with_shape`.
/// Any other parse error fails the test.
#[tokio::test]
async fn test_docx_parser_with_two_page_file() {
    let parser = DocxParser::new();
    let test_file = get_test_file_path("test_file_2pages_multilingual.docx");
    if !test_file.exists() {
        eprintln!("Skipping test: test file not found at {test_file:?}");
        return;
    }

    let document = match parser.parse_local_path(&test_file).await {
        Ok(parsed) => parsed,
        Err(error) => {
            let error_msg = error.to_string();
            let is_known_limitation =
                error_msg.contains("malformed XML") || error_msg.contains("rotate_with_shape");
            if is_known_limitation {
                eprintln!("Known docx-rust limitation with gradient fills: {error_msg}");
                return;
            }
            panic!("Unexpected error: {error_msg:?}");
        }
    };

    assert!(!document.blocks.is_empty(), "Document should have blocks");
}
/// Parses the large English fixture from disk and checks that the result
/// contains blocks.
///
/// Returns early when the fixture file is absent, and also when parsing fails
/// with an error whose text contains `malformed XML` or `rotate_with_shape`.
/// Any other parse error fails the test.
#[tokio::test]
async fn test_docx_parser_with_big_english_file() {
    let parser = DocxParser::new();
    let test_file = get_test_file_path("test_file_big_english.docx");
    if !test_file.exists() {
        eprintln!("Skipping test: test file not found at {test_file:?}");
        return;
    }

    let document = match parser.parse_local_path(&test_file).await {
        Ok(parsed) => parsed,
        Err(error) => {
            let error_msg = error.to_string();
            let is_known_limitation =
                error_msg.contains("malformed XML") || error_msg.contains("rotate_with_shape");
            if is_known_limitation {
                eprintln!("Known docx-rust limitation with gradient fills: {error_msg}");
                return;
            }
            panic!("Unexpected error: {error_msg:?}");
        }
    };

    assert!(!document.blocks.is_empty(), "Document should have blocks");
}
/// Parses the table edge-cases fixture from disk and checks that the result
/// contains blocks and an original filename.
///
/// Returns early (after logging to stderr) when the fixture file is absent.
///
/// NOTE(review): despite the `_returns_error` suffix, this test requires
/// parsing to succeed. The name is kept so existing test filters keep working.
#[tokio::test]
async fn test_docx_parser_with_edge_cases_file_returns_error() {
    let parser = DocxParser::new();
    let test_file = get_test_file_path("test_file_1table_edge_cases.docx");
    if !test_file.exists() {
        eprintln!("Skipping test: test file not found at {test_file:?}");
        return;
    }

    // The file is parsed from a local path, so the failure message says so
    // rather than mentioning bytes.
    let document = parser
        .parse_local_path(&test_file)
        .await
        .expect("Failed to parse edge cases DOCX file");

    assert!(!document.blocks.is_empty(), "Document should have blocks");
    assert!(document.meta.original_filename.is_some());
}
/// Parses the table-plus-image multilingual fixture from disk and checks that
/// the result contains blocks, logging the block count to stderr.
///
/// Returns early (after logging to stderr) when the fixture file is absent.
#[tokio::test]
async fn test_docx_parser_with_image_multilingual_file() {
    let parser = DocxParser::new();
    let test_file = get_test_file_path("test_file_1table_image_multilingual.docx");
    if !test_file.exists() {
        eprintln!("Skipping test: test file not found at {test_file:?}");
        return;
    }

    // `expect` reports the parse error directly, so no separate `is_ok()`
    // assertion is needed before unwrapping.
    let document = parser
        .parse_local_path(&test_file)
        .await
        .expect("Failed to parse image multilingual DOCX file");

    assert!(!document.blocks.is_empty(), "Document should have blocks");
    eprintln!(
        "Successfully parsed complex DOCX file with images ({} blocks)",
        document.blocks.len()
    );
}
/// Reads the single-table multilingual fixture into memory and parses it via
/// `parse_bytes`, passing both a filename and a content-type hint.
///
/// Checks that the result contains blocks and an original filename.
/// Returns early (after logging to stderr) when the fixture file is absent.
#[tokio::test]
async fn test_docx_parser_parse_bytes() {
    let parser = DocxParser::new();
    let test_file = get_test_file_path("test_file_1table_multilingual.docx");
    if !test_file.exists() {
        eprintln!("Skipping test: test file not found at {test_file:?}");
        return;
    }

    let file_bytes = std::fs::read(&test_file).expect("Failed to read test file");
    let bytes = bytes::Bytes::from(file_bytes);

    // `expect` reports the parse error directly, so no separate `is_ok()`
    // assertion is needed before unwrapping.
    let document = parser
        .parse_bytes(
            Some("test_file_1table_multilingual.docx"),
            Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
            bytes,
        )
        .await
        .expect("Failed to parse DOCX from bytes");

    assert!(!document.blocks.is_empty(), "Document should have blocks");
    assert!(document.meta.original_filename.is_some());
}
/// Reads the single-table multilingual fixture into memory and parses it via
/// `parse_bytes` with no filename and no content-type hint.
///
/// Checks that the result contains blocks and that `original_filename` is `None`.
/// Returns early (after logging to stderr) when the fixture file is absent.
#[tokio::test]
async fn test_docx_parser_parse_bytes_without_filename() {
    let parser = DocxParser::new();
    let test_file = get_test_file_path("test_file_1table_multilingual.docx");
    if !test_file.exists() {
        eprintln!("Skipping test: test file not found at {test_file:?}");
        return;
    }

    let file_bytes = std::fs::read(&test_file).expect("Failed to read test file");
    let bytes = bytes::Bytes::from(file_bytes);

    // `expect` reports the parse error directly, so no separate `is_ok()`
    // assertion is needed before unwrapping.
    let document = parser
        .parse_bytes(None, None, bytes)
        .await
        .expect("Failed to parse DOCX from bytes without filename");

    assert!(!document.blocks.is_empty(), "Document should have blocks");
    assert_eq!(
        document.meta.original_filename, None,
        "Without filename hint, original_filename should be None"
    );
}
/// Checks that parsing a path that does not exist fails, and that the error
/// text contains one of the expected file-access phrases.
#[tokio::test]
async fn test_docx_parser_nonexistent_file() {
    let parser = DocxParser::new();
    let missing_path = PathBuf::from("/nonexistent/path/to/file.docx");

    let error = match parser.parse_local_path(&missing_path).await {
        Err(error) => error,
        Ok(_) => panic!("Should fail on nonexistent file"),
    };
    let error_msg = error.to_string();

    // Any one of these phrases is accepted as evidence of a file-access failure.
    let accepted_phrases = [
        "Failed to read file",
        "No such file",
        "Failed to open DOCX file",
        "cannot find the path",
    ];
    let mentions_file_issue = accepted_phrases
        .iter()
        .any(|phrase| error_msg.contains(*phrase));
    assert!(
        mentions_file_issue,
        "Error should mention file reading issue, got: {error_msg}"
    );
}
/// Checks that `parse_bytes` rejects a plain-text payload and that the error
/// text contains one of the expected DOCX failure phrases.
#[tokio::test]
async fn test_docx_parser_invalid_docx_bytes() {
    let parser = DocxParser::new();
    let payload = bytes::Bytes::from("This is not a valid DOCX file");

    let error = match parser
        .parse_bytes(Some("invalid.docx"), None, payload)
        .await
    {
        Err(error) => error,
        Ok(_) => panic!("Should fail on invalid DOCX data"),
    };
    let error_msg = error.to_string();

    // Either phrase is accepted as evidence of a DOCX parsing failure.
    let accepted_phrases = ["Failed to parse DOCX", "Failed to open DOCX file"];
    let mentions_parse_failure = accepted_phrases
        .iter()
        .any(|phrase| error_msg.contains(*phrase));
    assert!(
        mentions_parse_failure,
        "Error should mention DOCX parsing failure, got: {error_msg}"
    );
}
/// Parses the single-table multilingual fixture from disk and checks the
/// structure of every table block found at the top level of the document.
///
/// The document must contain at least one table. Every table must have at
/// least one row, every row at least one cell, and every cell at least one block.
/// Returns early (after logging to stderr) when the fixture file is absent.
#[tokio::test]
async fn test_docx_parser_extracts_tables() {
    use file_parser::domain::ir::ParsedBlock;

    let parser = DocxParser::new();
    let test_file = get_test_file_path("test_file_1table_multilingual.docx");
    if !test_file.exists() {
        eprintln!("Skipping test: test file not found at {test_file:?}");
        return;
    }

    // `expect` reports the parse error directly, so no separate `is_ok()`
    // assertion is needed before unwrapping.
    let document = parser
        .parse_local_path(&test_file)
        .await
        .expect("Failed to parse DOCX file");

    // Collect the table blocks once so the presence check and the structural
    // checks share a single pass over the document.
    let tables: Vec<_> = document
        .blocks
        .iter()
        .filter_map(|block| match block {
            ParsedBlock::Table(table_block) => Some(table_block),
            _ => None,
        })
        .collect();
    assert!(
        !tables.is_empty(),
        "Document should contain at least one table"
    );

    for table_block in tables {
        assert!(
            !table_block.rows.is_empty(),
            "Table should have at least one row"
        );
        for (row_idx, row) in table_block.rows.iter().enumerate() {
            assert!(
                !row.cells.is_empty(),
                "Row {row_idx} should have at least one cell"
            );
            for (cell_idx, cell) in row.cells.iter().enumerate() {
                assert!(
                    !cell.blocks.is_empty(),
                    "Cell ({row_idx}, {cell_idx}) should have at least one block"
                );
            }
        }
        // Indexing row 0 is safe: the table was asserted to be non-empty above.
        eprintln!(
            "Table has {} rows with {} cells in first row",
            table_block.rows.len(),
            table_block.rows[0].cells.len()
        );
    }
}
/// Parses the table edge-cases fixture from disk and checks that the document
/// contains at least one top-level table block.
///
/// Returns early (after logging to stderr) when the fixture file is absent.
#[tokio::test]
async fn test_docx_parser_table_edge_cases() {
    use file_parser::domain::ir::ParsedBlock;

    let parser = DocxParser::new();
    let test_file = get_test_file_path("test_file_1table_edge_cases.docx");
    if !test_file.exists() {
        eprintln!("Skipping test: test file not found at {test_file:?}");
        return;
    }

    // `expect` reports the parse error directly, so no separate `is_ok()`
    // assertion is needed before unwrapping.
    let document = parser
        .parse_local_path(&test_file)
        .await
        .expect("Failed to parse DOCX file");

    // `any` stops at the first table; no need to count them all.
    let has_table = document
        .blocks
        .iter()
        .any(|block| matches!(block, ParsedBlock::Table(_)));
    assert!(has_table, "Document should contain at least one table");
}