processors_rs/
html_processor.rs1use crate::markdown_processor::MarkdownProcessor;
2use crate::processor::{Document, DocumentProcessor};
3use anyhow::Result;
4use htmd::{HtmlToMarkdown, HtmlToMarkdownBuilder};
5use regex::Regex;
6use text_splitter::ChunkConfigError;
7
/// An HTML payload paired with an optional description of where it came from.
///
/// NOTE(review): nothing in this file constructs or reads this type, so the
/// exact meaning of `origin` (URL, file path, ...) should be confirmed against
/// its callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlDocument {
    /// The document text (presumably raw HTML, going by the type name).
    pub content: String,
    /// Optional source identifier for `content`; `None` when it is not known.
    pub origin: Option<String>,
}
12
13pub struct HtmlProcessor {
15 markdown_processor: MarkdownProcessor,
16 html_to_markdown: HtmlToMarkdown,
17}
18
19impl HtmlProcessor {
20 pub fn new(chunk_size: usize, overlap: usize) -> Result<HtmlProcessor, ChunkConfigError> {
21 let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
22 let html_to_markdown = HtmlToMarkdownBuilder::new().build();
23 Ok(HtmlProcessor {
24 markdown_processor,
25 html_to_markdown,
26 })
27 }
28}
29
30impl DocumentProcessor for HtmlProcessor {
31 fn process_document(&self, content: &str) -> Result<Document> {
32 let mut content = self.html_to_markdown.convert(content)?;
33
34 let link_regex = Regex::new(r"\[([^\]]+)\]\([^\)]+\)").unwrap();
37 content = link_regex.replace_all(&content, "$1").to_string();
38
39
40 self.markdown_processor.process_document(&content)
41 }
42}
43
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::FileProcessor;

    /// Builds the processor configuration shared by every test in this module.
    fn build_processor() -> HtmlProcessor {
        HtmlProcessor::new(128, 0).unwrap()
    }

    /// Processing the HTML fixture at the given path is expected to succeed.
    #[test]
    fn test_process_html_file() {
        let processor = build_processor();
        let outcome = processor.process_file("../test_files/test.html");
        assert!(outcome.is_ok());
    }

    /// Processing the path below is expected to report an error.
    #[test]
    fn test_process_html_file_err() {
        let processor = build_processor();
        let outcome = processor.process_file("../test_files/some_file_that_doesnt_exist.html");
        assert!(outcome.is_err());
    }
}