processors_rs/
html_processor.rs

1use crate::markdown_processor::MarkdownProcessor;
2use crate::processor::{Document, DocumentProcessor};
3use anyhow::Result;
4use htmd::{HtmlToMarkdown, HtmlToMarkdownBuilder};
5use text_splitter::ChunkConfigError;
6
/// An HTML document held in memory, optionally tagged with where it came from.
///
/// NOTE(review): nothing in this file constructs or reads this type, so the
/// field descriptions below are inferred from the field names — confirm
/// against the callers.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HtmlDocument {
    /// The text of the document (presumably raw HTML markup).
    pub content: String,
    /// Where the document was obtained from, if known (presumably a URL or a
    /// file path).
    pub origin: Option<String>,
}
11
12/// A Struct for processing HTML files.
13pub struct HtmlProcessor {
14    markdown_processor: MarkdownProcessor,
15    html_to_markdown: HtmlToMarkdown,
16}
17
18impl HtmlProcessor {
19    pub fn new(chunk_size: usize, overlap: usize) -> Result<HtmlProcessor, ChunkConfigError> {
20        let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
21        let html_to_markdown = HtmlToMarkdownBuilder::new().build();
22        Ok(HtmlProcessor {
23            markdown_processor,
24            html_to_markdown,
25        })
26    }
27}
28
29impl DocumentProcessor for HtmlProcessor {
30    fn process_document(&self, content: &str) -> Result<Document> {
31        let content = self.html_to_markdown.convert(content)?;
32        self.markdown_processor.process_document(&content)
33    }
34}
35
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::FileProcessor;

    /// Chunk size used by every test in this module.
    const CHUNK_SIZE: usize = 128;
    /// Chunk overlap used by every test in this module.
    const OVERLAP: usize = 0;

    /// Builds the processor under test; panics if the configuration is rejected.
    fn new_processor() -> HtmlProcessor {
        HtmlProcessor::new(CHUNK_SIZE, OVERLAP).unwrap()
    }

    /// Processing the HTML fixture file is expected to succeed.
    #[test]
    fn test_process_html_file() {
        let processor = new_processor();
        let path = "../test_files/test.html";
        let outcome = processor.process_file(path);
        assert!(outcome.is_ok());
    }

    /// Processing a path that does not exist is expected to report an error.
    #[test]
    fn test_process_html_file_err() {
        let processor = new_processor();
        let path = "../test_files/some_file_that_doesnt_exist.html";
        let outcome = processor.process_file(path);
        assert!(outcome.is_err());
    }
}