processors_rs/
docx_processor.rs1use std::path::Path;
2use docx_parser::MarkdownDocument;
3use text_splitter::ChunkConfigError;
4use crate::markdown_processor::MarkdownProcessor;
5use crate::processor::{Document, DocumentProcessor, FileProcessor};
6
7pub struct DocxProcessor {
9 markdown_processor: MarkdownProcessor,
10}
11
12impl DocxProcessor {
13 pub fn new(chunk_size: usize, overlap: usize) -> Result<DocxProcessor, ChunkConfigError> {
14 let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
15 Ok(DocxProcessor {
16 markdown_processor,
17 })
18 }
19}
20
21impl FileProcessor for DocxProcessor {
22 fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document> {
23 let docs = MarkdownDocument::from_file(path);
24 let markdown = docs.to_markdown(false);
25 self.markdown_processor.process_document(&markdown)
26 }
27}
28
#[cfg(test)]
mod tests {
    use super::*;

    /// Processing the docx fixture must yield a chunk holding its sentence.
    #[test]
    fn test_extract_text() {
        let docx_file = "../test_files/test.docx";
        let processor = DocxProcessor::new(128, 0).unwrap();

        let document = processor.process_file(docx_file).unwrap();
        assert!(document.chunks.contains(&"This is a docx file test".to_string()));
    }

    /// A path that does not exist must make processing panic with a
    /// "not found" I/O error.
    ///
    /// `expected` is matched as a substring of the panic message, so it stops
    /// before the OS-provided description ("No such file or directory" on
    /// Unix), whose wording differs between platforms.
    #[test]
    #[should_panic(expected = "Error processing file: IO(Os { code: 2, kind: NotFound")]
    fn test_extract_text_invalid_file_path() {
        let invalid_file_path = "this_file_definitely_does_not_exist.docx";
        let processor = DocxProcessor::new(128, 0).unwrap();
        processor.process_file(invalid_file_path).unwrap();
    }
}