processors_rs/
docx_processor.rs

1use std::path::Path;
2use docx_parser::MarkdownDocument;
3use text_splitter::ChunkConfigError;
4use crate::markdown_processor::MarkdownProcessor;
5use crate::processor::{Document, DocumentProcessor, FileProcessor};
6
7/// A struct for processing PDF files.
8pub struct DocxProcessor {
9    markdown_processor: MarkdownProcessor,
10}
11
12impl DocxProcessor {
13    pub fn new(chunk_size: usize, overlap: usize) -> Result<DocxProcessor, ChunkConfigError> {
14        let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
15        Ok(DocxProcessor {
16            markdown_processor,
17        })
18    }
19}
20
21impl FileProcessor for DocxProcessor {
22    fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document> {
23        let docs = MarkdownDocument::from_file(path);
24        let markdown = docs.to_markdown(false);
25        self.markdown_processor.process_document(&markdown)
26    }
27}
28
#[cfg(test)]
mod tests {
    use super::*;

    // Processing the sample DOCX file yields a chunk with the expected text.
    #[test]
    fn test_extract_text() {
        let docx_path = "../test_files/test.docx";
        let docx_processor = DocxProcessor::new(128, 0).unwrap();

        let document = docx_processor.process_file(docx_path).unwrap();
        let expected_chunk = String::from("This is a docx file test");
        assert!(document.chunks.contains(&expected_chunk));
    }

    // Processing a path that does not exist panics with the loader's I/O error message.
    #[test]
    #[should_panic(expected = "Error processing file: IO(Os { code: 2, kind: NotFound, message: \"No such file or directory\" })")]
    fn test_extract_text_invalid_file_path() {
        let missing_path = "this_file_definitely_does_not_exist.docx";
        let docx_processor = DocxProcessor::new(128, 0).unwrap();
        docx_processor.process_file(missing_path).unwrap();
    }
}