processors_rs/
html_processor.rs1use crate::markdown_processor::MarkdownProcessor;
2use crate::processor::{Document, DocumentProcessor};
3use anyhow::Result;
4use htmd::{HtmlToMarkdown, HtmlToMarkdownBuilder};
5use text_splitter::ChunkConfigError;
6
/// A document's text together with an optional record of where it came from.
///
/// The common standard traits are derived so values can be logged, duplicated,
/// compared in assertions and default-constructed by callers.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct HtmlDocument {
    /// The document text (presumably raw HTML, going by the type name — confirm with callers).
    pub content: String,
    /// Optional origin of the document, e.g. a URL or file path.
    /// NOTE(review): the exact meaning is an assumption; this field is not read in the visible code.
    pub origin: Option<String>,
}
11
12pub struct HtmlProcessor {
14 markdown_processor: MarkdownProcessor,
15 html_to_markdown: HtmlToMarkdown,
16}
17
18impl HtmlProcessor {
19 pub fn new(chunk_size: usize, overlap: usize) -> Result<HtmlProcessor, ChunkConfigError> {
20 let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
21 let html_to_markdown = HtmlToMarkdownBuilder::new().build();
22 Ok(HtmlProcessor {
23 markdown_processor,
24 html_to_markdown,
25 })
26 }
27}
28
29impl DocumentProcessor for HtmlProcessor {
30 fn process_document(&self, content: &str) -> Result<Document> {
31 let content = self.html_to_markdown.convert(content)?;
32 self.markdown_processor.process_document(&content)
33 }
34}
35
#[cfg(test)]
mod tests {
    use super::*;
    use crate::processor::FileProcessor;

    /// Builds the processor configuration shared by every test in this module.
    fn processor_under_test() -> HtmlProcessor {
        HtmlProcessor::new(128, 0).unwrap()
    }

    /// Processing the HTML fixture file succeeds.
    #[test]
    fn test_process_html_file() {
        let processor = processor_under_test();
        let outcome = processor.process_file("../test_files/test.html");
        assert!(outcome.is_ok());
    }

    /// Processing a path that does not exist reports an error.
    #[test]
    fn test_process_html_file_err() {
        let processor = processor_under_test();
        let outcome = processor.process_file("../test_files/some_file_that_doesnt_exist.html");
        assert!(outcome.is_err());
    }
}