1use html_to_markdown::{TagHandler, markdown};
2use std::io::Read;
3use std::sync::RwLock;
4use std::{cell::RefCell, rc::Rc};
5use url::Url;
6
7use alith_core::{
8 chunking::{ChunkError, Chunker, chunk_text},
9 knowledge::{Knowledge, KnowledgeError},
10};
11
12pub struct HtmlKnowledge<R> {
13 html: RwLock<R>,
14 url: Url,
15 to_markdown: bool,
16}
17
18impl<R: Read> HtmlKnowledge<R> {
19 pub fn new(html: R, url: Url, to_markdown: bool) -> Self {
20 Self {
21 html: RwLock::new(html),
22 url,
23 to_markdown,
24 }
25 }
26}
27
28impl<R: Read + Send + Sync> Chunker for HtmlKnowledge<R> {
29 fn chunk(&self) -> std::result::Result<Vec<String>, ChunkError> {
30 Ok(chunk_text(
31 &self
32 .load()
33 .map_err(|err| ChunkError::Normal(err.to_string()))?,
34 self.chunk_size() as u32,
35 self.overlap_percent(),
36 )
37 .map_err(|err| ChunkError::Normal(err.to_string()))?
38 .unwrap_or_default())
39 }
40}
41
42impl<R: Read + Sync + Send> Knowledge for HtmlKnowledge<R> {
43 fn load(&self) -> Result<String, KnowledgeError> {
44 let mut html = self
45 .html
46 .write()
47 .map_err(|err| KnowledgeError::LoadError(err.to_string()))?;
48 let mut html = html.by_ref();
49 let cleaned_html = readability::extractor::extract(&mut html, &self.url)
50 .map_err(|err| KnowledgeError::LoadError(err.to_string()))?;
51 let html = format!("{}\n{}", cleaned_html.title, cleaned_html.text);
52 if self.to_markdown {
53 Ok(html_to_md(&html))
54 } else {
55 Ok(html)
56 }
57 }
58
59 fn enrich(&self, _input: &str) -> Result<String, KnowledgeError> {
60 Ok(format!("<html>{}</html>", self.load()?))
61 }
62}
63
64pub fn html_to_md(html: &str) -> String {
66 let mut handlers: Vec<TagHandler> = vec![
67 Rc::new(RefCell::new(markdown::ParagraphHandler)),
68 Rc::new(RefCell::new(markdown::HeadingHandler)),
69 Rc::new(RefCell::new(markdown::ListHandler)),
70 Rc::new(RefCell::new(markdown::TableHandler::new())),
71 Rc::new(RefCell::new(markdown::StyledTextHandler)),
72 Rc::new(RefCell::new(markdown::CodeHandler)),
73 Rc::new(RefCell::new(markdown::WebpageChromeRemover)),
74 ];
75
76 html_to_markdown::convert_html_to_markdown(html.as_bytes(), &mut handlers)
77 .unwrap_or_else(|_| html.to_string())
78}