processors_rs/
html_processor.rs

1use crate::markdown_processor::MarkdownProcessor;
2use crate::processor::{Document, DocumentProcessor};
3use anyhow::Result;
4use htmd::{HtmlToMarkdown, HtmlToMarkdownBuilder};
5use regex::Regex;
6use text_splitter::ChunkConfigError;
7
/// An HTML document held in memory.
///
/// The common traits are derived so values can be debug-printed, cloned,
/// compared in tests, and built with `..Default::default()`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct HtmlDocument {
    /// The document's content.
    pub content: String,
    /// Optional origin of the document; `None` when not recorded.
    pub origin: Option<String>,
}
12
/// A processor for HTML content.
///
/// HTML input is first converted to Markdown and the result is then handed
/// to the wrapped [`MarkdownProcessor`].
pub struct HtmlProcessor {
    /// Processes the Markdown produced from the HTML input.
    markdown_processor: MarkdownProcessor,
    /// Converter used to turn the HTML input into Markdown.
    html_to_markdown: HtmlToMarkdown,
}
18
19impl HtmlProcessor {
20    pub fn new(chunk_size: usize, overlap: usize) -> Result<HtmlProcessor, ChunkConfigError> {
21        let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
22        let html_to_markdown = HtmlToMarkdownBuilder::new().build();
23        Ok(HtmlProcessor {
24            markdown_processor,
25            html_to_markdown,
26        })
27    }
28}
29
30impl DocumentProcessor for HtmlProcessor {
31    fn process_document(&self, content: &str) -> Result<Document> {
32        let mut content = self.html_to_markdown.convert(content)?;
33        
34        // Remove markdown hyperlinks: [text](url) -> text
35        // This regex matches markdown link syntax and replaces it with just the link text
36        let link_regex = Regex::new(r"\[([^\]]+)\]\([^\)]+\)").unwrap();
37        content = link_regex.replace_all(&content, "$1").to_string();
38        
39
40        self.markdown_processor.process_document(&content)
41    }
42}
43
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::FileProcessor;

    /// Builds the processor configuration shared by the tests below.
    fn make_processor() -> HtmlProcessor {
        HtmlProcessor::new(128, 0).unwrap()
    }

    /// Processing an existing HTML fixture succeeds.
    #[test]
    fn test_process_html_file() {
        let outcome = make_processor().process_file("../test_files/test.html");
        assert!(outcome.is_ok());
    }

    /// Processing a path that does not exist reports an error.
    #[test]
    fn test_process_html_file_err() {
        let outcome =
            make_processor().process_file("../test_files/some_file_that_doesnt_exist.html");
        assert!(outcome.is_err());
    }
}