processors_rs/
docx_processor.rs1use crate::markdown_processor::MarkdownProcessor;
2use crate::processor::{Document, DocumentProcessor, FileProcessor};
3use docx_parser::MarkdownDocument;
4use std::path::Path;
5use text_splitter::ChunkConfigError;
6
7pub struct DocxProcessor {
9 markdown_processor: MarkdownProcessor,
10}
11
12impl DocxProcessor {
13 pub fn new(chunk_size: usize, overlap: usize) -> Result<DocxProcessor, ChunkConfigError> {
14 let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
15 Ok(DocxProcessor { markdown_processor })
16 }
17}
18
19impl FileProcessor for DocxProcessor {
20 fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document> {
21 let docs = MarkdownDocument::from_file(path);
22 let markdown = docs.to_markdown(false);
23 self.markdown_processor.process_document(&markdown)
24 }
25}
26
#[cfg(test)]
mod tests {
    use super::*;

    /// A known sample document must yield the expected sentence as one chunk.
    #[test]
    fn test_extract_text() {
        let processor = DocxProcessor::new(128, 0).unwrap();
        let document = processor
            .process_file("../test_files/test.docx")
            .unwrap();

        let expected_chunk = "This is a docx file test".to_string();
        assert!(document.chunks.contains(&expected_chunk));
    }

    /// Processing a path that does not exist must fail with the I/O
    /// "not found" message.
    #[test]
    #[should_panic(
        expected = "Error processing file: IO(Os { code: 2, kind: NotFound, message: \"No such file or directory\" })"
    )]
    fn test_extract_text_invalid_file_path() {
        let processor = DocxProcessor::new(128, 0).unwrap();
        processor
            .process_file("this_file_definitely_does_not_exist.docx")
            .unwrap();
    }
}