Skip to main content

llm_text/
html.rs

1use std::io::Cursor;
2
3use dom_smoothie::{Article, Config, Readability};
4use eyre::Result;
5use html2text::from_read;
6
7/// Clean HTML content and convert to Markdown-like structured text.
8/// This preserves structural elements like headings, lists, and tables
9/// which are valuable for LLM understanding.
10pub fn clean_html(html: &str) -> Result<String> {
11    let cfg = Config { max_elements_to_parse: 9000, ..Default::default() };
12    let mut readable = Readability::new(html, Some("http://example.com"), Some(cfg))?;
13    let article: Article = readable.parse()?;
14
15    // Convert HTML to structured text preserving headings, lists, links, etc.
16    let content = Cursor::new(article.content.as_bytes());
17    let text = from_read(content, 10000)?;
18
19    // Clean up excess whitespace while preserving structure
20    Ok(super::text::TextCleaner::new().reduce_newlines_to_double_newline().run(&text))
21}