Skip to main content

lean_ctx/core/web/
html_to_text.rs

//! Dependency-free HTML → Markdown / plain-text conversion.
//!
//! A small tag tokenizer feeds a state-machine renderer. The goal is *clean,
//! readable* content for an LLM, not a faithful DOM: noise elements (script,
//! style, nav chrome) are dropped, block structure becomes Markdown headings /
//! lists / paragraphs, links become `[text](href)`, and `<pre>` becomes fenced
//! code. Implemented without an HTML crate to stay in line with the project's
//! zero-heavy-dependency stance.
9
/// One hyperlink collected while rendering a document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Link {
    /// Visible text of the anchor.
    pub text: String,
    /// Target taken from the anchor's `href` attribute.
    pub href: String,
}
16
17/// Parsed document: optional `<title>`, rendered Markdown, and extracted links.
18#[derive(Debug, Clone)]
19pub struct HtmlDoc {
20    pub title: Option<String>,
21    pub markdown: String,
22    pub links: Vec<Link>,
23}
24
25/// Convert an HTML document into Markdown plus extracted metadata.
26pub fn parse(html: &str) -> HtmlDoc {
27    let title = extract_title(html);
28    let content = select_main(html);
29    let mut renderer = Renderer::default();
30    for token in tokenize(content) {
31        renderer.consume(&token);
32    }
33    let markdown = normalize(&renderer.out);
34    HtmlDoc {
35        title,
36        markdown,
37        links: renderer.links,
38    }
39}
40
41/// Extract just the document `<title>` without rendering the body.
42pub fn title(html: &str) -> Option<String> {
43    extract_title(html)
44}
45
46/// Strip Markdown decorations to obtain flowing plain text.
47pub fn markdown_to_text(markdown: &str) -> String {
48    let mut out = String::with_capacity(markdown.len());
49    let mut in_fence = false;
50    for line in markdown.lines() {
51        if line.trim_start().starts_with("```") {
52            in_fence = !in_fence;
53            continue;
54        }
55        if in_fence {
56            out.push_str(line);
57            out.push('\n');
58            continue;
59        }
60        let stripped = strip_inline_markup(line);
61        out.push_str(&stripped);
62        out.push('\n');
63    }
64    out.trim().to_string()
65}
66
67fn strip_inline_markup(line: &str) -> String {
68    let without_heading = line.trim_start().trim_start_matches('#').trim_start();
69    replace_links_with_text(without_heading)
70}
71
/// Rewrite every `[text](href)` span as just `text`.
///
/// A `[` that is not followed by `]`, an immediately adjacent `(`, and a later
/// `)` is copied through unchanged.
fn replace_links_with_text(s: &str) -> String {
    let mut plain = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(ch) = rest.chars().next() {
        if ch == '[' {
            // Try to read `label](target)` right after the bracket.
            let link = rest[1..].split_once(']').and_then(|(label, after_label)| {
                let after_paren = after_label.strip_prefix('(')?;
                let (_target, tail) = after_paren.split_once(')')?;
                Some((label, tail))
            });
            if let Some((label, tail)) = link {
                plain.push_str(label);
                rest = tail;
                continue;
            }
        }
        plain.push(ch);
        rest = &rest[ch.len_utf8()..];
    }

    plain
}
97
/// Byte length of a UTF-8 sequence, judged from its first byte.
///
/// Any byte that is not an ASCII, 2-byte, or 3-byte lead is reported as 4.
fn utf8_len(first: u8) -> usize {
    if first & 0x80 == 0 {
        1
    } else if first & 0xE0 == 0xC0 {
        2
    } else if first & 0xF0 == 0xE0 {
        3
    } else {
        4
    }
}
106
107// ── Main-content selection ────────────────────────────────────────────────
108
109fn select_main(html: &str) -> &str {
110    if let Some(inner) = first_element_inner(html, "main") {
111        return inner;
112    }
113    if let Some(inner) = first_element_inner(html, "body") {
114        return inner;
115    }
116    html
117}
118
/// Return the inner slice of the first `<tag ...> ... </tag>` (case-insensitive).
///
/// `tag` must be given in lowercase. Candidate matches whose name merely
/// starts with `tag` (e.g. `<mainframe>` or `</mainly>` when looking for
/// `main`) are skipped and the search continues, instead of aborting or
/// matching the wrong element.
///
/// Returns `None` when no opening tag, no `>` ending it, or no closing tag
/// is found.
fn first_element_inner<'a>(html: &'a str, tag: &str) -> Option<&'a str> {
    /// True when `rest` begins with a character that ends a tag name.
    fn ends_tag_name(rest: &str) -> bool {
        rest.chars()
            .next()
            .is_some_and(|c| matches!(c, '>' | ' ' | '\t' | '\n' | '\r' | '/'))
    }

    // ASCII lowercasing keeps byte offsets identical to `html`.
    let lower = html.to_ascii_lowercase();
    let open_marker = format!("<{tag}");
    let close_marker = format!("</{tag}");

    // Find the first opening tag whose name is exactly `tag`.
    let mut from = 0;
    let open_pos = loop {
        let pos = from + lower[from..].find(&open_marker)?;
        let after_name = pos + open_marker.len();
        if ends_tag_name(&lower[after_name..]) {
            break pos;
        }
        from = after_name;
    };

    let body_start = open_pos + lower[open_pos..].find('>')? + 1;

    // Find the first closing tag whose name is exactly `tag`. A marker that
    // runs to the end of the input is still accepted.
    let mut from = body_start;
    let close_pos = loop {
        let pos = from + lower[from..].find(&close_marker)?;
        let after_name = pos + close_marker.len();
        let rest = &lower[after_name..];
        if rest.is_empty() || ends_tag_name(rest) {
            break pos;
        }
        from = after_name;
    };

    Some(&html[body_start..close_pos])
}
138
139fn extract_title(html: &str) -> Option<String> {
140    let inner = first_element_inner(html, "title")?;
141    let decoded = decode_entities(inner);
142    let collapsed = collapse_ws(&decoded);
143    let trimmed = collapsed.trim();
144    if trimmed.is_empty() {
145        None
146    } else {
147        Some(trimmed.to_string())
148    }
149}
150
151// ── Tokenizer ─────────────────────────────────────────────────────────────
152
/// One lexical unit produced by `tokenize`.
enum Token<'a> {
    /// An opening tag such as `<a href="...">` or `<br/>`.
    Open {
        /// Tag name, lowercased.
        name: String,
        /// Raw text following the name inside the tag, borrowed from the input.
        attrs: &'a str,
        /// Whether the tag text ends with `/`.
        self_closing: bool,
    },
    /// A closing tag such as `</a>`.
    Close {
        /// Tag name, lowercased.
        name: String,
    },
    /// A run of text between tags, borrowed from the input and not yet decoded.
    Text(&'a str),
}
164
165fn tokenize(html: &str) -> Vec<Token<'_>> {
166    let bytes = html.as_bytes();
167    let n = bytes.len();
168    let mut tokens = Vec::new();
169    let mut i = 0;
170
171    while i < n {
172        if bytes[i] == b'<' {
173            if html[i..].starts_with("<!--") {
174                match html[i + 4..].find("-->") {
175                    Some(end) => i = i + 4 + end + 3,
176                    None => break,
177                }
178                continue;
179            }
180            if i + 1 < n && bytes[i + 1] == b'!' {
181                match html[i..].find('>') {
182                    Some(end) => i += end + 1,
183                    None => break,
184                }
185                continue;
186            }
187            if let Some(end) = tag_end(bytes, i) {
188                parse_tag(&html[i + 1..end], &mut tokens);
189                i = end + 1;
190            } else {
191                tokens.push(Token::Text(&html[i..]));
192                break;
193            }
194        } else {
195            let start = i;
196            while i < n && bytes[i] != b'<' {
197                i += 1;
198            }
199            tokens.push(Token::Text(&html[start..i]));
200        }
201    }
202    tokens
203}
204
/// Locate the `>` that terminates the tag whose `<` sits at `start`.
///
/// A `>` inside a single- or double-quoted span does not count. Returns
/// `None` when the input ends before such a `>` is seen.
fn tag_end(bytes: &[u8], start: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;
    for (idx, &byte) in bytes.iter().enumerate().skip(start + 1) {
        match open_quote {
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(idx),
                _ => {}
            },
        }
    }
    None
}
224
225fn parse_tag<'a>(inner: &'a str, tokens: &mut Vec<Token<'a>>) {
226    let trimmed = inner.trim_start();
227    if let Some(rest) = trimmed.strip_prefix('/') {
228        let name = take_name(rest);
229        if !name.is_empty() {
230            tokens.push(Token::Close { name });
231        }
232        return;
233    }
234    let name = take_name(trimmed);
235    if name.is_empty() {
236        return;
237    }
238    let attrs = &trimmed[name.len()..];
239    let self_closing = trimmed.trim_end().ends_with('/');
240    tokens.push(Token::Open {
241        name,
242        attrs,
243        self_closing,
244    });
245}
246
/// Read the leading tag name from `s` — ASCII letters, digits, `-` and `:` —
/// and return it lowercased. Returns an empty string when `s` does not start
/// with such a character.
fn take_name(s: &str) -> String {
    let end = s
        .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-' || c == ':'))
        .unwrap_or(s.len());
    s[..end].to_ascii_lowercase()
}
253
254fn get_attr(attrs: &str, key: &str) -> Option<String> {
255    let lower = attrs.to_ascii_lowercase();
256    let mut from = 0;
257    while let Some(pos) = lower[from..].find(key) {
258        let idx = from + pos;
259        let boundary = idx == 0 || lower.as_bytes()[idx - 1].is_ascii_whitespace();
260        let after = idx + key.len();
261        let rest = attrs[after..].trim_start();
262        if boundary && rest.starts_with('=') {
263            return Some(parse_attr_value(rest[1..].trim_start()));
264        }
265        from = after;
266    }
267    None
268}
269
/// Read an attribute value from the text that follows `=`.
///
/// A value starting with `"` or `'` runs to the matching quote, or to the end
/// of the text when the quote is never closed. An unquoted value is the first
/// whitespace-delimited word with any trailing `/` removed.
fn parse_attr_value(s: &str) -> String {
    for quote in ['"', '\''] {
        if let Some(body) = s.strip_prefix(quote) {
            let end = body.find(quote).unwrap_or(body.len());
            return body[..end].to_string();
        }
    }
    let bare = s.split_whitespace().next().unwrap_or("");
    bare.trim_end_matches('/').to_string()
}
287
288// ── Renderer ──────────────────────────────────────────────────────────────
289
/// State for one open `<ul>` / `<ol>` while rendering.
struct ListCtx {
    /// `true` for `<ol>`, `false` for `<ul>`.
    ordered: bool,
    /// Number given to the most recent item; incremented before each ordered item.
    index: usize,
}
294
295#[derive(Default)]
296struct Renderer {
297    out: String,
298    links: Vec<Link>,
299    skip_depth: usize,
300    pre_depth: usize,
301    anchor: Option<(String, String)>,
302    list_stack: Vec<ListCtx>,
303}
304
305impl Renderer {
306    fn consume(&mut self, token: &Token<'_>) {
307        match token {
308            Token::Text(t) => self.text(t),
309            Token::Open {
310                name,
311                attrs,
312                self_closing,
313            } => self.open(name, attrs, *self_closing),
314            Token::Close { name } => self.close(name),
315        }
316    }
317
318    fn text(&mut self, raw: &str) {
319        if self.skip_depth > 0 {
320            return;
321        }
322        let decoded = decode_entities(raw);
323        if self.pre_depth > 0 {
324            self.out.push_str(&decoded);
325            return;
326        }
327        let collapsed = collapse_ws(&decoded);
328        if collapsed.is_empty() {
329            return;
330        }
331        match self.anchor {
332            Some((_, ref mut buf)) => buf.push_str(&collapsed),
333            None => self.out.push_str(&collapsed),
334        }
335    }
336
337    fn open(&mut self, name: &str, attrs: &str, self_closing: bool) {
338        if self.skip_depth > 0 {
339            if is_skip(name) && !self_closing && !is_void(name) {
340                self.skip_depth += 1;
341            }
342            return;
343        }
344        if is_skip(name) {
345            if !self_closing && !is_void(name) {
346                self.skip_depth += 1;
347            }
348            return;
349        }
350        if self_closing || is_void(name) {
351            self.open_void(name);
352            return;
353        }
354
355        match name {
356            "a" => self.open_anchor(attrs),
357            "pre" => {
358                self.block_break();
359                self.out.push_str("```");
360                self.newline();
361                self.pre_depth += 1;
362            }
363            "code" if self.pre_depth == 0 => self.out.push('`'),
364            "ul" => {
365                self.list_stack.push(ListCtx {
366                    ordered: false,
367                    index: 0,
368                });
369                self.block_break();
370            }
371            "ol" => {
372                self.list_stack.push(ListCtx {
373                    ordered: true,
374                    index: 0,
375                });
376                self.block_break();
377            }
378            "li" => {
379                self.newline();
380                let marker = match self.list_stack.last_mut() {
381                    Some(ctx) if ctx.ordered => {
382                        ctx.index += 1;
383                        format!("{}. ", ctx.index)
384                    }
385                    _ => "- ".to_string(),
386                };
387                self.out.push_str(&marker);
388            }
389            "tr" => self.newline(),
390            "blockquote" => {
391                self.block_break();
392                self.out.push_str("> ");
393            }
394            h if is_heading(h) => {
395                self.block_break();
396                for _ in 0..heading_level(h) {
397                    self.out.push('#');
398                }
399                self.out.push(' ');
400            }
401            b if is_block(b) => self.block_break(),
402            _ => {}
403        }
404    }
405
406    fn open_void(&mut self, name: &str) {
407        match name {
408            "br" => self.newline(),
409            "hr" => {
410                self.block_break();
411                self.out.push_str("---");
412                self.block_break();
413            }
414            _ => {}
415        }
416    }
417
418    fn open_anchor(&mut self, attrs: &str) {
419        if self.anchor.is_some() {
420            return;
421        }
422        if let Some(href) = get_attr(attrs, "href") {
423            let href = href.trim();
424            if !href.is_empty() && !href.starts_with("javascript:") && !href.starts_with('#') {
425                self.anchor = Some((href.to_string(), String::new()));
426            }
427        }
428    }
429
430    fn close(&mut self, name: &str) {
431        if self.skip_depth > 0 {
432            if is_skip(name) {
433                self.skip_depth -= 1;
434            }
435            return;
436        }
437        match name {
438            "a" => {
439                if let Some((href, text)) = self.anchor.take() {
440                    let text = text.trim();
441                    if !text.is_empty() {
442                        self.out.push_str(&format!("[{text}]({href})"));
443                        self.links.push(Link {
444                            text: text.to_string(),
445                            href,
446                        });
447                    }
448                }
449            }
450            "pre" => {
451                self.pre_depth = self.pre_depth.saturating_sub(1);
452                self.newline();
453                self.out.push_str("```");
454                self.block_break();
455            }
456            "code" if self.pre_depth == 0 => self.out.push('`'),
457            "ul" | "ol" => {
458                self.list_stack.pop();
459                self.block_break();
460            }
461            "td" | "th" => self.out.push_str(" | "),
462            h if is_heading(h) => self.block_break(),
463            b if is_block(b) => self.block_break(),
464            _ => {}
465        }
466    }
467
468    fn newline(&mut self) {
469        if !self.out.ends_with('\n') {
470            self.out.push('\n');
471        }
472    }
473
474    fn block_break(&mut self) {
475        while self.out.ends_with(' ') {
476            self.out.pop();
477        }
478        if self.out.is_empty() {
479            return;
480        }
481        if self.out.ends_with("\n\n") {
482            return;
483        }
484        if self.out.ends_with('\n') {
485            self.out.push('\n');
486        } else {
487            self.out.push_str("\n\n");
488        }
489    }
490}
491
/// Whether `name` is an element whose entire contents are left out of the output.
fn is_skip(name: &str) -> bool {
    const SKIPPED: &[&str] = &[
        "script", "style", "noscript", "svg", "template", "iframe", "head", "object", "embed",
        "canvas", "math",
    ];
    SKIPPED.contains(&name)
}
508
/// Whether `name` is a void element, i.e. one that never has a closing tag.
///
/// `embed` is included: it is void in HTML, and because it is also a skipped
/// element, treating it as a container would open a skip region that no
/// closing tag ever ends.
fn is_void(name: &str) -> bool {
    matches!(
        name,
        "br" | "hr"
            | "img"
            | "input"
            | "meta"
            | "link"
            | "source"
            | "col"
            | "area"
            | "base"
            | "wbr"
            | "track"
            | "param"
            | "embed"
    )
}
526
/// Whether `name` is a generic block-level element that is separated from its
/// surroundings by a blank line.
fn is_block(name: &str) -> bool {
    const BLOCKS: &[&str] = &[
        "p",
        "div",
        "section",
        "article",
        "main",
        "header",
        "footer",
        "aside",
        "nav",
        "dl",
        "dd",
        "dt",
        "table",
        "thead",
        "tbody",
        "tfoot",
        "figure",
        "figcaption",
        "address",
        "form",
        "fieldset",
        "details",
        "summary",
    ];
    BLOCKS.contains(&name)
}
554
/// Whether `name` is exactly one of `h1` through `h6`.
fn is_heading(name: &str) -> bool {
    matches!(name.as_bytes(), [b'h', b'1'..=b'6'])
}
558
/// Numeric level of a heading tag name, read from its second byte.
///
/// Only meaningful for names accepted by `is_heading`; panics on names
/// shorter than two bytes.
fn heading_level(name: &str) -> usize {
    usize::from(name.as_bytes()[1] - b'0')
}
562
563// ── Whitespace + entities ──────────────────────────────────────────────────
564
/// Replace every run of whitespace characters with a single ASCII space.
///
/// Leading and trailing runs are collapsed too, not removed.
fn collapse_ws(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for ch in s.chars() {
        if !ch.is_whitespace() {
            collapsed.push(ch);
        } else if !collapsed.ends_with(' ') {
            // First whitespace of a run; later ones in the run are dropped.
            collapsed.push(' ');
        }
    }
    collapsed
}
581
/// Tidy rendered Markdown.
///
/// Outside code fences each line is trimmed and consecutive blank lines are
/// reduced to one. A line consisting only of a fence marker toggles fence
/// mode and is written without indentation; lines inside a fence are kept
/// verbatim. The final result is trimmed at both ends.
fn normalize(s: &str) -> String {
    let mut cleaned = String::with_capacity(s.len());
    let mut inside_code = false;
    let mut last_was_blank = false;

    for raw in s.lines() {
        let stripped = raw.trim();
        if stripped == "```" {
            inside_code = !inside_code;
            last_was_blank = false;
            cleaned.push_str("```");
        } else if inside_code {
            cleaned.push_str(raw);
        } else if stripped.is_empty() {
            if last_was_blank {
                // Second or later blank line in a row: drop it.
                continue;
            }
            last_was_blank = true;
        } else {
            last_was_blank = false;
            cleaned.push_str(stripped);
        }
        cleaned.push('\n');
    }

    cleaned.trim().to_string()
}
613
614/// Decode HTML/XML character entities (`&amp;`, `&#39;`, `&#x2019;`, …).
615///
616/// Exposed for sibling modules (e.g. the YouTube srv3 transcript parser) so
617/// entity handling lives in exactly one place.
618pub fn decode_entities(s: &str) -> String {
619    if !s.contains('&') {
620        return s.to_string();
621    }
622    let mut out = String::with_capacity(s.len());
623    let bytes = s.as_bytes();
624    let mut i = 0;
625    while i < bytes.len() {
626        if bytes[i] == b'&' {
627            if let Some(rel_end) = s[i..].find(';') {
628                let end = i + rel_end;
629                let entity = &s[i + 1..end];
630                if let Some(decoded) = decode_one(entity) {
631                    out.push_str(&decoded);
632                    i = end + 1;
633                    continue;
634                }
635            }
636            out.push('&');
637            i += 1;
638        } else {
639            let ch_len = utf8_len(bytes[i]);
640            out.push_str(&s[i..i + ch_len]);
641            i += ch_len;
642        }
643    }
644    out
645}
646
/// Decode a single entity body (the text between `&` and `;`).
///
/// * `#NNN` / `#xHHH` numeric references must consist solely of digits of
///   the respective radix (no sign). Code point 0 becomes U+FFFD rather than
///   a literal NUL; values that are not valid `char`s (surrogates, out of
///   range, overflow) yield `None`.
/// * Named references are limited to the small table below; `nbsp` is
///   deliberately mapped to an ordinary space.
///
/// Returns `None` for anything unrecognized so the caller can leave the
/// original text untouched.
fn decode_one(entity: &str) -> Option<String> {
    if let Some(num) = entity.strip_prefix('#') {
        let (digits, radix) = match num.strip_prefix(['x', 'X']) {
            Some(hex) => (hex, 16),
            None => (num, 10),
        };
        // The integer parsers accept a leading `+`; a character reference does not.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        let code = u32::from_str_radix(digits, radix).ok()?;
        let ch = if code == 0 {
            char::REPLACEMENT_CHARACTER
        } else {
            char::from_u32(code)?
        };
        return Some(ch.to_string());
    }
    let named = match entity {
        "amp" => "&",
        "lt" => "<",
        "gt" => ">",
        "quot" => "\"",
        "apos" => "'",
        "nbsp" => " ",
        "mdash" => "—",
        "ndash" => "–",
        "hellip" => "…",
        "copy" => "©",
        "reg" => "®",
        "trade" => "™",
        "laquo" => "«",
        "raquo" => "»",
        "lsquo" => "‘",
        "rsquo" => "’",
        "ldquo" => "“",
        "rdquo" => "”",
        "bull" => "•",
        "middot" => "·",
        "euro" => "€",
        "pound" => "£",
        "deg" => "°",
        "times" => "×",
        "divide" => "÷",
        _ => return None,
    };
    Some(named.to_string())
}
686
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: render `html` and return only the Markdown.
    fn md(html: &str) -> String {
        parse(html).markdown
    }

    #[test]
    fn extracts_title_and_decodes() {
        let html = "<html><head><title>Foo &amp; Bar</title></head><body><p>Hi</p></body></html>";
        assert_eq!(parse(html).title.as_deref(), Some("Foo & Bar"));
    }

    #[test]
    fn drops_script_and_style() {
        let rendered =
            md("<body><script>var x=1;</script><style>.a{}</style><p>Visible</p></body>");
        assert_eq!(rendered, "Visible");
        assert!(!rendered.contains("var x"));
    }

    #[test]
    fn renders_headings_and_paragraphs() {
        let rendered = md("<body><h1>Title</h1><p>First.</p><p>Second.</p></body>");
        assert_eq!(rendered, "# Title\n\nFirst.\n\nSecond.");
    }

    #[test]
    fn renders_links_and_collects_them() {
        let doc = parse(r#"<body><p>See <a href="https://x.com/a">the site</a> now.</p></body>"#);
        assert!(doc.markdown.contains("[the site](https://x.com/a)"));
        assert_eq!(doc.links.len(), 1);
        let link = &doc.links[0];
        assert_eq!(link.href, "https://x.com/a");
        assert_eq!(link.text, "the site");
    }

    #[test]
    fn renders_unordered_and_ordered_lists() {
        let rendered =
            md("<body><ul><li>one</li><li>two</li></ul><ol><li>a</li><li>b</li></ol></body>");
        for expected in ["- one", "- two", "1. a", "2. b"] {
            assert!(rendered.contains(expected));
        }
    }

    #[test]
    fn prefers_main_over_chrome() {
        let rendered = md("<body><nav><a href=/x>menu</a></nav><main><p>Core content</p></main><footer>foot</footer></body>");
        assert_eq!(rendered, "Core content");
    }

    #[test]
    fn preserves_pre_as_fenced_code() {
        let rendered = md("<body><pre>line1\n  line2</pre></body>");
        assert!(rendered.contains("```"));
        assert!(rendered.contains("line1\n  line2"));
    }

    #[test]
    fn markdown_to_text_strips_markup() {
        let plain = markdown_to_text("# Heading\n\nSee [link](https://x.com) here.");
        assert_eq!(plain, "Heading\n\nSee link here.");
    }

    #[test]
    fn handles_unterminated_tag_gracefully() {
        assert!(md("<body><p>ok</p><broken").contains("ok"));
    }

    #[test]
    fn decodes_numeric_entities() {
        assert_eq!(decode_entities("A&#38;B&#x41;"), "A&BA");
    }
}