Skip to main content

kbolt_core/ingest/
html.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use scraper::{ElementRef, Html, Node, Selector};
5
6use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
7use crate::Result;
8
9pub struct HtmlExtractor;
10
11impl Extractor for HtmlExtractor {
12    fn supports(&self) -> &[&str] {
13        &["html", "htm"]
14    }
15
16    fn profile_key(&self) -> &'static str {
17        "html"
18    }
19
20    fn version(&self) -> u32 {
21        4
22    }
23
24    fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
25        let source = std::str::from_utf8(bytes).map_err(|err| {
26            kbolt_types::KboltError::InvalidInput(format!("non-utf8 html input: {err}"))
27        })?;
28        let document = Html::parse_document(source);
29        let mut state = ExtractionState::new(document_title(&document));
30
31        let body_selector = Selector::parse("body").expect("valid body selector");
32        let bodies = document.select(&body_selector).collect::<Vec<_>>();
33        if bodies.is_empty() {
34            state.walk_element(document.root_element());
35        } else {
36            for body in bodies {
37                state.walk_element(body);
38            }
39        }
40
41        Ok(ExtractedDocument {
42            blocks: state.blocks,
43            metadata: HashMap::new(),
44            title: state.title.or(state.first_h1),
45        })
46    }
47}
48
49struct ExtractionState {
50    blocks: Vec<ExtractedBlock>,
51    heading_stack: Vec<String>,
52    next_offset: usize,
53    title: Option<String>,
54    first_h1: Option<String>,
55}
56
57impl ExtractionState {
58    fn new(title: Option<String>) -> Self {
59        Self {
60            blocks: Vec::new(),
61            heading_stack: Vec::new(),
62            next_offset: 0,
63            title,
64            first_h1: None,
65        }
66    }
67
68    fn walk_element(&mut self, element: ElementRef<'_>) -> bool {
69        let name = element_name(element);
70        if should_skip_element(element) {
71            return false;
72        }
73
74        if let Some(kind) = block_kind_for(name) {
75            let text = match kind {
76                BlockKind::CodeFence => collect_preserved_text(element),
77                _ => collect_normal_text(element),
78            };
79            self.push_block(kind, name, text);
80            return true;
81        }
82
83        let mut emitted_child = false;
84        let mut residual = TextCollector::normal();
85        for child in element.children() {
86            match child.value() {
87                Node::Text(text) => residual.push(text),
88                Node::Element(_) => {
89                    let Some(child_element) = ElementRef::wrap(child) else {
90                        continue;
91                    };
92                    let child_name = element_name(child_element);
93                    if should_skip_element(child_element) {
94                        continue;
95                    }
96                    if is_text_boundary_element(child_name) {
97                        residual.push_boundary();
98                        continue;
99                    }
100
101                    if block_kind_for(child_name).is_some() || is_structural_container(child_name) {
102                        emitted_child |= self.push_residual_paragraph(&mut residual);
103                        emitted_child |= self.walk_element(child_element);
104                    } else {
105                        collect_text_from_element(child_element, &mut residual);
106                    }
107                }
108                _ => {}
109            }
110        }
111
112        emitted_child |= self.push_residual_paragraph(&mut residual);
113        emitted_child
114    }
115
116    fn push_residual_paragraph(&mut self, residual: &mut TextCollector) -> bool {
117        let text = residual.take();
118        if text.is_empty() {
119            return false;
120        }
121
122        self.push_block(BlockKind::Paragraph, "p", text);
123        true
124    }
125
126    fn push_block(&mut self, kind: BlockKind, element_name: &str, text: String) {
127        if text.is_empty() {
128            return;
129        }
130
131        let heading_path = self.heading_stack.clone();
132        let offset = self.next_offset;
133        let length = text.len();
134        self.next_offset = self.next_offset.saturating_add(length).saturating_add(2);
135
136        if kind == BlockKind::Heading {
137            let heading = text.clone();
138            if let Some(level) = heading_level(element_name) {
139                if level == 1 && self.first_h1.is_none() {
140                    self.first_h1 = Some(heading.clone());
141                }
142                apply_heading(&mut self.heading_stack, level, heading);
143            }
144        }
145
146        self.blocks.push(ExtractedBlock {
147            text,
148            offset,
149            length,
150            kind,
151            heading_path,
152            attrs: HashMap::new(),
153        });
154    }
155}
156
157fn document_title(document: &Html) -> Option<String> {
158    let selector = Selector::parse("title").expect("valid title selector");
159    document
160        .select(&selector)
161        .next()
162        .map(collect_normal_text)
163        .filter(|title| !title.is_empty())
164}
165
166fn element_name(element: ElementRef<'_>) -> &str {
167    element.value().name()
168}
169
170fn should_skip_element(element: ElementRef<'_>) -> bool {
171    if should_skip_element_name(element_name(element)) {
172        return true;
173    }
174
175    element.value().attr("hidden").is_some()
176        || element
177            .value()
178            .attr("aria-hidden")
179            .is_some_and(|value| value.trim().eq_ignore_ascii_case("true"))
180        || element
181            .value()
182            .attr("style")
183            .is_some_and(style_declares_hidden)
184}
185
/// Reports whether `name` is a tag whose contents are never extracted.
///
/// The comparison is exact, so callers must pass the name in the same case
/// as the list below.
fn should_skip_element_name(name: &str) -> bool {
    const NON_CONTENT_TAGS: [&str; 8] = [
        "head", "script", "style", "template", "noscript", "svg", "canvas", "math",
    ];
    NON_CONTENT_TAGS.contains(&name)
}
192
193fn block_kind_for(name: &str) -> Option<BlockKind> {
194    if heading_level(name).is_some() {
195        return Some(BlockKind::Heading);
196    }
197
198    match name {
199        "p" => Some(BlockKind::Paragraph),
200        "li" => Some(BlockKind::ListItem),
201        "dt" | "dd" => Some(BlockKind::Paragraph),
202        "blockquote" => Some(BlockKind::BlockQuote),
203        "pre" => Some(BlockKind::CodeFence),
204        _ => None,
205    }
206}
207
/// Reports whether `name` is a void element (`br` or `hr`) that separates the
/// text around it without contributing text of its own.
fn is_text_boundary_element(name: &str) -> bool {
    name == "br" || name == "hr"
}
211
/// Reports whether `name` is a container tag that is descended into rather
/// than flattened into the surrounding text.
///
/// The comparison is exact against the list below.
fn is_structural_container(name: &str) -> bool {
    const CONTAINER_TAGS: [&str; 24] = [
        // Document and sectioning containers.
        "html", "body", "main", "article", "section", "div", "header", "footer", "nav", "aside",
        // List containers.
        "ul", "ol", "menu", "dl",
        // Table structure.
        "table", "thead", "tbody", "tfoot", "tr", "td", "th", "caption",
        // Figures.
        "figure", "figcaption",
    ];
    CONTAINER_TAGS.contains(&name)
}
241
/// Reports whether an inline `style` attribute value hides its element.
///
/// Only `display` and `visibility` declarations are inspected. For each
/// property the winning declaration is chosen by `apply_style_state`; the
/// element counts as hidden when the winning `display` is `none` or the
/// winning `visibility` is `hidden` or `collapse` (ASCII case-insensitive).
fn style_declares_hidden(style: &str) -> bool {
    let mut display: Option<StyleDeclarationState> = None;
    let mut visibility: Option<StyleDeclarationState> = None;

    for declaration in style.split(';') {
        let Some((property, raw_value)) = declaration.split_once(':') else {
            continue;
        };

        // Everything before the first `!` is the value; any later segment
        // equal to "important" marks the declaration as important.
        let mut segments = raw_value.split('!');
        let value = segments.next().unwrap_or(raw_value).trim();
        let important = segments.any(|flag| flag.trim().eq_ignore_ascii_case("important"));

        let property = property.trim();
        if property.eq_ignore_ascii_case("display") {
            let hidden = value.eq_ignore_ascii_case("none");
            apply_style_state(&mut display, StyleDeclarationState { important, hidden });
        } else if property.eq_ignore_ascii_case("visibility") {
            let hidden =
                value.eq_ignore_ascii_case("hidden") || value.eq_ignore_ascii_case("collapse");
            apply_style_state(
                &mut visibility,
                StyleDeclarationState { important, hidden },
            );
        }
    }

    let hides = |slot: Option<StyleDeclarationState>| matches!(slot, Some(state) if state.hidden);
    hides(display) || hides(visibility)
}

/// Outcome of one `display` / `visibility` declaration.
#[derive(Clone, Copy)]
struct StyleDeclarationState {
    /// Whether the declaration carried `!important`.
    important: bool,
    /// Whether the declared value hides the element.
    hidden: bool,
}

/// Replaces `current` with `next` unless `current` is important and `next`
/// is not, so later declarations win except against an earlier `!important`.
fn apply_style_state(current: &mut Option<StyleDeclarationState>, next: StyleDeclarationState) {
    let current_wins = matches!(*current, Some(state) if state.important && !next.important);
    if !current_wins {
        *current = Some(next);
    }
}
291
/// Returns the numeric level (1–6) for the tag names `h1` through `h6`, and
/// `None` for every other name. Matching is exact and lowercase-only.
fn heading_level(name: &str) -> Option<usize> {
    match name.as_bytes() {
        [b'h', digit @ b'1'..=b'6'] => Some(usize::from(digit - b'0')),
        _ => None,
    }
}
299
/// Updates the heading stack for a newly encountered heading.
///
/// Entries at depth `level` or deeper are dropped, then `heading` is pushed,
/// so the stack ends with at most `level` entries.
///
/// A `level` of 0 is treated like level 1: the stack is cleared before the
/// push. (A pop-until-shorter loop would never terminate for 0, because an
/// empty stack is never shorter than 0.)
///
/// NOTE(review): the stack stores no levels, so its depth only matches the
/// heading level when no level is skipped. After `h1` → `h3`, a second `h3`
/// is pushed beneath the first instead of replacing it. Fixing that needs
/// per-entry levels, which this signature cannot carry.
fn apply_heading(stack: &mut Vec<String>, level: usize, heading: String) {
    stack.truncate(level.saturating_sub(1));
    stack.push(heading);
}
306
307fn collect_normal_text(element: ElementRef<'_>) -> String {
308    let mut collector = TextCollector::normal();
309    collect_text_from_element(element, &mut collector);
310    collector.finish()
311}
312
313fn collect_preserved_text(element: ElementRef<'_>) -> String {
314    let mut collector = TextCollector::preserve();
315    collect_text_from_element(element, &mut collector);
316    trim_preserved_text(collector.finish().as_str())
317}
318
319fn collect_text_from_element(element: ElementRef<'_>, collector: &mut TextCollector) {
320    if should_skip_element(element) {
321        return;
322    }
323
324    for child in element.children() {
325        match child.value() {
326            Node::Text(text) => collector.push(text),
327            Node::Element(_) => {
328                if let Some(child_element) = ElementRef::wrap(child) {
329                    let child_name = element_name(child_element);
330                    if should_skip_element(child_element) {
331                        continue;
332                    }
333
334                    if is_text_boundary_element(child_name) {
335                        collector.push_boundary();
336                    } else if block_kind_for(child_name).is_some()
337                        || is_structural_container(child_name)
338                    {
339                        collector.push_boundary();
340                        collect_text_from_element(child_element, collector);
341                        collector.push_boundary();
342                    } else {
343                        collect_text_from_element(child_element, collector);
344                    }
345                }
346            }
347            _ => {}
348        }
349    }
350}
351
/// How a `TextCollector` treats whitespace in the text pushed into it.
enum TextMode {
    /// Collapse every whitespace run into a single space.
    Normal,
    /// Keep the text exactly as pushed.
    Preserve,
}

/// Incrementally builds the text of one block.
struct TextCollector {
    /// Text accumulated so far.
    text: String,
    /// Whitespace handling applied by `push` and `push_boundary`.
    mode: TextMode,
    /// In normal mode, whether the last character appended was a collapsed
    /// space; used to avoid emitting consecutive spaces.
    last_was_space: bool,
}

impl TextCollector {
    /// Creates an empty collector operating in `mode`.
    fn with_mode(mode: TextMode) -> Self {
        Self {
            mode,
            text: String::new(),
            last_was_space: false,
        }
    }

    /// Creates a collector that collapses whitespace.
    fn normal() -> Self {
        Self::with_mode(TextMode::Normal)
    }

    /// Creates a collector that keeps whitespace verbatim.
    fn preserve() -> Self {
        Self::with_mode(TextMode::Preserve)
    }

    /// Appends `raw` according to the collector's mode.
    fn push(&mut self, raw: &str) {
        if matches!(self.mode, TextMode::Preserve) {
            self.text.push_str(raw);
        } else {
            self.push_normal(raw);
        }
    }

    /// Marks a separation between two pieces of text.
    ///
    /// Normal mode treats the boundary like a whitespace character; preserve
    /// mode ensures the text ends with a newline.
    fn push_boundary(&mut self) {
        match self.mode {
            // A boundary collapses exactly like one pushed whitespace char.
            TextMode::Normal => self.push_normal(" "),
            TextMode::Preserve => {
                if !self.text.ends_with('\n') {
                    self.text.push('\n');
                }
                self.last_was_space = false;
            }
        }
    }

    /// Appends `raw`, collapsing whitespace runs to one space and dropping
    /// whitespace while the buffer is still empty.
    fn push_normal(&mut self, raw: &str) {
        for ch in raw.chars() {
            if !ch.is_whitespace() {
                self.text.push(ch);
                self.last_was_space = false;
                continue;
            }

            let suppress_space = self.text.is_empty() || self.last_was_space;
            if !suppress_space {
                self.text.push(' ');
                self.last_was_space = true;
            }
        }
    }

    /// Consumes the collector and returns its text (trimmed in normal mode).
    fn finish(mut self) -> String {
        self.take()
    }

    /// Drains the accumulated text, leaving the collector empty and reusable.
    ///
    /// Normal mode returns the text trimmed; preserve mode returns it as is.
    fn take(&mut self) -> String {
        let drained = std::mem::take(&mut self.text);
        self.last_was_space = false;
        if matches!(self.mode, TextMode::Normal) {
            drained.trim().to_string()
        } else {
            drained
        }
    }
}
434
/// Normalizes line endings to `\n` and removes blank lines (empty or
/// whitespace-only) from the start and end of `text`.
///
/// Interior lines, including their indentation and any blank lines between
/// them, are kept unchanged.
fn trim_preserved_text(text: &str) -> String {
    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");

    // Drop leading blank lines while collecting, then peel trailing ones.
    let mut kept: Vec<&str> = normalized
        .lines()
        .skip_while(|line| line.trim().is_empty())
        .collect();
    while kept.last().is_some_and(|line| line.trim().is_empty()) {
        kept.pop();
    }

    kept.join("\n")
}
450
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::{BlockKind, Extractor};
    use crate::ingest::html::HtmlExtractor;

    /// Runs the extractor on `html`, panicking if extraction fails.
    fn extract_html(path: &str, html: &[u8]) -> crate::ingest::extract::ExtractedDocument {
        HtmlExtractor
            .extract(Path::new(path), html)
            .expect("extract html")
    }

    /// Joins every block's text with a blank line between blocks.
    fn canonical_text(doc: &crate::ingest::extract::ExtractedDocument) -> String {
        let texts: Vec<&str> = doc.blocks.iter().map(|block| block.text.as_str()).collect();
        texts.join("\n\n")
    }

    /// Reports whether `doc` has a block of `kind` whose text equals `text`.
    fn has_block(
        doc: &crate::ingest::extract::ExtractedDocument,
        kind: BlockKind,
        text: &str,
    ) -> bool {
        doc.blocks
            .iter()
            .any(|block| block.kind == kind && block.text == text)
    }

    #[test]
    fn extracts_structural_html_blocks() {
        assert_eq!(HtmlExtractor.profile_key(), "html");

        let source = br#"<!doctype html>
<html>
  <head>
    <title>Guide Title</title>
    <style>.hidden { display: none; }</style>
    <script>ignored_script()</script>
  </head>
  <body>
    <h1>Guide</h1>
    <p>Alpha <strong>HTML</strong> &amp; canonical text.</p>
    <ul><li>First item</li></ul>
    <blockquote>Quoted text</blockquote>
    <pre><code>
fn main() {}
    </code></pre>
  </body>
</html>"#;

        let doc = extract_html("docs/guide.html", source);

        assert_eq!(doc.title.as_deref(), Some("Guide Title"));
        assert!(has_block(&doc, BlockKind::Heading, "Guide"));
        // The paragraph sits under the <h1>, so its path is just that heading.
        assert!(doc.blocks.iter().any(|block| {
            block.kind == BlockKind::Paragraph
                && block.text == "Alpha HTML & canonical text."
                && block.heading_path == ["Guide"]
        }));
        assert!(has_block(&doc, BlockKind::ListItem, "First item"));
        assert!(has_block(&doc, BlockKind::BlockQuote, "Quoted text"));
        assert!(doc.blocks.iter().any(|block| {
            block.kind == BlockKind::CodeFence && block.text.contains("fn main() {}")
        }));
        assert!(doc
            .blocks
            .iter()
            .all(|block| !block.text.contains("ignored_script")));
    }

    #[test]
    fn uses_first_h1_as_title_when_title_is_missing() {
        let doc = extract_html(
            "docs/guide.html",
            b"<main><h2>Section</h2><p>before</p><h1>Guide</h1><p>body</p></main>",
        );

        assert_eq!(doc.title.as_deref(), Some("Guide"));
    }

    #[test]
    fn preserves_visible_text_from_mixed_unrecognized_containers() {
        let doc = extract_html(
            "docs/prices.html",
            br#"<body>
Lead text.
<p>Intro paragraph.</p>
<table><tr><td>Price tabletarget</td></tr></table>
<span>Tail text.</span>
</body>"#,
        );

        let texts: Vec<&str> = doc.blocks.iter().map(|block| block.text.as_str()).collect();
        for expected in [
            "Lead text.",
            "Intro paragraph.",
            "Price tabletarget",
            "Tail text.",
        ] {
            assert!(texts.contains(&expected));
        }

        // The paragraph must be emitted once, not duplicated into residual text.
        let intro_blocks = texts
            .iter()
            .filter(|text| text.contains("Intro paragraph."))
            .count();
        assert_eq!(intro_blocks, 1);
    }

    #[test]
    fn preserves_boundaries_for_html_separator_elements() {
        let doc = extract_html(
            "docs/separators.html",
            br#"<body>
<p>alpha<br>beta brtarget</p>
<dl><dt>Term</dt><dd>Definition ddtarget</dd></dl>
<div><span>left</span><hr><span>right hrtarget</span></div>
</body>"#,
        );

        let canonical = canonical_text(&doc);
        for expected in [
            "alpha beta brtarget",
            "Term\n\nDefinition ddtarget",
            "left right hrtarget",
        ] {
            assert!(canonical.contains(expected));
        }
        for fused in ["alphabeta", "TermDefinition", "leftright"] {
            assert!(!canonical.contains(fused));
        }
    }

    #[test]
    fn preserves_boundaries_for_nested_block_children() {
        let doc = extract_html(
            "docs/nested.html",
            br#"<body>
<blockquote><p>Alpha quote</p><p>Beta quotetarget</p></blockquote>
<ul><li><p>Parent item</p><p>Child paragraph listtarget</p></li></ul>
</body>"#,
        );

        let canonical = canonical_text(&doc);
        for expected in [
            "Alpha quote Beta quotetarget",
            "Parent item Child paragraph listtarget",
        ] {
            assert!(canonical.contains(expected));
        }
        for fused in ["quoteBeta", "itemChild"] {
            assert!(!canonical.contains(fused));
        }
    }

    #[test]
    fn skips_hidden_html_elements() {
        let doc = extract_html(
            "docs/hidden.html",
            br#"<body>
<p>Visible target</p>
<div hidden>secret hiddenword</div>
<section aria-hidden=" true "><p>aria hiddenword</p></section>
<div style="display: none">style hiddenword</div>
<div style="visibility:hidden !important">visibility hiddenword</div>
<div style="display:none !important; display:block">important hiddenword</div>
<div style="display:none; display:block">Actually visible visibletarget</div>
<div style="visibility:hidden; visibility:visible !important">Visible important importanttarget</div>
</body>"#,
        );

        let canonical = canonical_text(&doc);
        for expected in [
            "Visible target",
            "Actually visible visibletarget",
            "Visible important importanttarget",
        ] {
            assert!(canonical.contains(expected));
        }
        assert!(!canonical.contains("hiddenword"));
    }

    #[test]
    fn rejects_non_utf8_html_bytes() {
        let invalid_utf8: &[u8] = &[0xff, 0xfe, 0xfd];
        let err = HtmlExtractor
            .extract(Path::new("docs/page.html"), invalid_utf8)
            .expect_err("invalid utf8 should fail");
        assert!(err.to_string().contains("non-utf8 html input"));
    }
}