// processors_rs/html_processor.rs
use anyhow::Result;
use htmd::{HtmlToMarkdown, HtmlToMarkdownBuilder};
use text_splitter::ChunkConfigError;
use crate::markdown_processor::MarkdownProcessor;
use crate::processor::{Document, DocumentProcessor};
6
/// A piece of HTML text paired with an optional origin string.
///
/// The common data-type traits are derived so values can be debug-printed,
/// cloned, and compared by callers and tests.
// NOTE(review): this type is not referenced anywhere else in this file;
// confirm how callers populate `origin` (path, URL, ...) before relying on it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlDocument {
    /// The document text (presumably raw HTML — confirm against callers).
    pub content: String,
    /// Where the content came from, if known.
    pub origin: Option<String>,
}
11
12pub struct HtmlProcessor {
14 markdown_processor: MarkdownProcessor,
15 html_to_markdown: HtmlToMarkdown,
16}
17
18impl HtmlProcessor {
19 pub fn new(chunk_size: usize, overlap: usize) -> Result<HtmlProcessor, ChunkConfigError> {
20 let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
21 let html_to_markdown = HtmlToMarkdownBuilder::new()
22 .build();
23 Ok(HtmlProcessor {
24 markdown_processor,
25 html_to_markdown,
26 })
27 }
28}
29
30impl DocumentProcessor for HtmlProcessor {
31 fn process_document(&self, content: &str) -> Result<Document> {
32 let content = self.html_to_markdown.convert(content)?;
33 self.markdown_processor.process_document(&content)
34 }
35}
36
#[cfg(test)]
mod tests {
    use super::*;
    // Brought into scope for `process_file` (presumably a `FileProcessor`
    // trait method — confirm in `crate::processor`).
    use crate::processor::FileProcessor;

    /// Builds the processor configuration shared by every test in this module.
    fn make_processor() -> HtmlProcessor {
        HtmlProcessor::new(128, 0).unwrap()
    }

    /// Processing the HTML fixture file is expected to succeed.
    #[test]
    fn test_process_html_file() {
        let outcome = make_processor().process_file("../test_files/test.html");
        assert!(outcome.is_ok());
    }

    /// Processing a path that does not exist is expected to return an error.
    #[test]
    fn test_process_html_file_err() {
        let outcome =
            make_processor().process_file("../test_files/some_file_that_doesnt_exist.html");
        assert!(outcome.is_err());
    }
}