Skip to main content

alith_knowledge/
html.rs

1use html_to_markdown::{TagHandler, markdown};
2use std::io::Read;
3use std::sync::RwLock;
4use std::{cell::RefCell, rc::Rc};
5use url::Url;
6
7use alith_core::{
8    chunking::{ChunkError, Chunker, chunk_text},
9    knowledge::{Knowledge, KnowledgeError},
10};
11
12pub struct HtmlKnowledge<R> {
13    html: RwLock<R>,
14    url: Url,
15    to_markdown: bool,
16}
17
18impl<R: Read> HtmlKnowledge<R> {
19    pub fn new(html: R, url: Url, to_markdown: bool) -> Self {
20        Self {
21            html: RwLock::new(html),
22            url,
23            to_markdown,
24        }
25    }
26}
27
28impl<R: Read + Send + Sync> Chunker for HtmlKnowledge<R> {
29    fn chunk(&self) -> std::result::Result<Vec<String>, ChunkError> {
30        Ok(chunk_text(
31            &self
32                .load()
33                .map_err(|err| ChunkError::Normal(err.to_string()))?,
34            self.chunk_size() as u32,
35            self.overlap_percent(),
36        )
37        .map_err(|err| ChunkError::Normal(err.to_string()))?
38        .unwrap_or_default())
39    }
40}
41
42impl<R: Read + Sync + Send> Knowledge for HtmlKnowledge<R> {
43    fn load(&self) -> Result<String, KnowledgeError> {
44        let mut html = self
45            .html
46            .write()
47            .map_err(|err| KnowledgeError::LoadError(err.to_string()))?;
48        let mut html = html.by_ref();
49        let cleaned_html = readability::extractor::extract(&mut html, &self.url)
50            .map_err(|err| KnowledgeError::LoadError(err.to_string()))?;
51        let html = format!("{}\n{}", cleaned_html.title, cleaned_html.text);
52        if self.to_markdown {
53            Ok(html_to_md(&html))
54        } else {
55            Ok(html)
56        }
57    }
58
59    fn enrich(&self, _input: &str) -> Result<String, KnowledgeError> {
60        Ok(format!("<html>{}</html>", self.load()?))
61    }
62}
63
64/// Converts the provided HTML string to Markdown string.
65pub fn html_to_md(html: &str) -> String {
66    let mut handlers: Vec<TagHandler> = vec![
67        Rc::new(RefCell::new(markdown::ParagraphHandler)),
68        Rc::new(RefCell::new(markdown::HeadingHandler)),
69        Rc::new(RefCell::new(markdown::ListHandler)),
70        Rc::new(RefCell::new(markdown::TableHandler::new())),
71        Rc::new(RefCell::new(markdown::StyledTextHandler)),
72        Rc::new(RefCell::new(markdown::CodeHandler)),
73        Rc::new(RefCell::new(markdown::WebpageChromeRemover)),
74    ];
75
76    html_to_markdown::convert_html_to_markdown(html.as_bytes(), &mut handlers)
77        .unwrap_or_else(|_| html.to_string())
78}