processors_rs/
html_processor.rs

1use anyhow::Result;
2use htmd::{HtmlToMarkdown, HtmlToMarkdownBuilder};
3use text_splitter::ChunkConfigError;
4use crate::markdown_processor::MarkdownProcessor;
5use crate::processor::{Document, DocumentProcessor};
6
/// An HTML document held in memory.
///
/// Derives the common value-type traits so instances can be logged,
/// duplicated and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlDocument {
    /// The document text (presumably raw HTML markup, going by the type
    /// name -- TODO confirm against callers).
    pub content: String,
    /// Optional origin of the document. NOTE(review): presumably a URL or
    /// file path identifying where `content` came from -- confirm against
    /// callers; `None` when no origin is recorded.
    pub origin: Option<String>,
}
11
12/// A Struct for processing HTML files.
13pub struct HtmlProcessor {
14    markdown_processor: MarkdownProcessor,
15    html_to_markdown: HtmlToMarkdown,
16}
17
18impl HtmlProcessor {
19    pub fn new(chunk_size: usize, overlap: usize) -> Result<HtmlProcessor, ChunkConfigError> {
20        let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
21        let html_to_markdown = HtmlToMarkdownBuilder::new()
22            .build();
23        Ok(HtmlProcessor {
24            markdown_processor,
25            html_to_markdown,
26        })
27    }
28}
29
30impl DocumentProcessor for HtmlProcessor {
31    fn process_document(&self, content: &str) -> Result<Document> {
32        let content = self.html_to_markdown.convert(content)?;
33        self.markdown_processor.process_document(&content)
34    }
35}
36
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::FileProcessor;

    /// Fixture that is expected to exist; the path is relative, so it is
    /// resolved against the working directory the tests run in.
    const EXISTING_HTML: &str = "../test_files/test.html";
    /// Path that is expected NOT to exist.
    const MISSING_HTML: &str = "../test_files/some_file_that_doesnt_exist.html";

    /// Builds the processor shared by the tests below (chunk size 128, overlap 0).
    fn make_processor() -> HtmlProcessor {
        HtmlProcessor::new(128, 0).unwrap()
    }

    /// Processing the HTML fixture file succeeds.
    #[test]
    fn test_process_html_file() {
        let outcome = make_processor().process_file(EXISTING_HTML);
        assert!(outcome.is_ok());
    }

    /// Processing a path that does not exist reports an error.
    #[test]
    fn test_process_html_file_err() {
        let outcome = make_processor().process_file(MISSING_HTML);
        assert!(outcome.is_err());
    }
}
57}