processors_rs/
docx_processor.rs

1use crate::markdown_processor::MarkdownProcessor;
2use crate::processor::{Document, DocumentProcessor, FileProcessor};
3use docx_parser::MarkdownDocument;
4use std::path::Path;
5use text_splitter::ChunkConfigError;
6
7/// A struct for processing PDF files.
8pub struct DocxProcessor {
9    markdown_processor: MarkdownProcessor,
10}
11
12impl DocxProcessor {
13    pub fn new(chunk_size: usize, overlap: usize) -> Result<DocxProcessor, ChunkConfigError> {
14        let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
15        Ok(DocxProcessor { markdown_processor })
16    }
17}
18
19impl FileProcessor for DocxProcessor {
20    fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document> {
21        let docs = MarkdownDocument::from_file(path);
22        let markdown = docs.to_markdown(false);
23        self.markdown_processor.process_document(&markdown)
24    }
25}
26
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the processor configuration shared by every test in this module.
    fn make_processor() -> DocxProcessor {
        DocxProcessor::new(128, 0).unwrap()
    }

    /// Processing the sample DOCX fixture yields a chunk equal to the
    /// sentence stored in that fixture.
    #[test]
    fn test_extract_text() {
        let docx_path = "../test_files/test.docx";
        let expected_chunk = String::from("This is a docx file test");

        let document = make_processor().process_file(docx_path).unwrap();

        assert!(document.chunks.contains(&expected_chunk));
    }

    /// Processing a path that does not exist panics with the I/O "not found"
    /// message below.
    ///
    /// NOTE(review): the panic presumably originates in the DOCX parser rather
    /// than from unwrapping an `Err` — confirm.
    #[test]
    #[should_panic(
        expected = "Error processing file: IO(Os { code: 2, kind: NotFound, message: \"No such file or directory\" })"
    )]
    fn test_extract_text_invalid_file_path() {
        let missing_path = "this_file_definitely_does_not_exist.docx";
        make_processor().process_file(missing_path).unwrap();
    }
}