1use std::collections::HashMap;
2use std::path::Path;
3
4use scraper::{ElementRef, Html, Node, Selector};
5
6use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
7use crate::Result;
8
9pub struct HtmlExtractor;
10
11impl Extractor for HtmlExtractor {
12 fn supports(&self) -> &[&str] {
13 &["html", "htm"]
14 }
15
16 fn profile_key(&self) -> &'static str {
17 "html"
18 }
19
20 fn version(&self) -> u32 {
21 4
22 }
23
24 fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
25 let source = std::str::from_utf8(bytes).map_err(|err| {
26 kbolt_types::KboltError::InvalidInput(format!("non-utf8 html input: {err}"))
27 })?;
28 let document = Html::parse_document(source);
29 let mut state = ExtractionState::new(document_title(&document));
30
31 let body_selector = Selector::parse("body").expect("valid body selector");
32 let bodies = document.select(&body_selector).collect::<Vec<_>>();
33 if bodies.is_empty() {
34 state.walk_element(document.root_element());
35 } else {
36 for body in bodies {
37 state.walk_element(body);
38 }
39 }
40
41 Ok(ExtractedDocument {
42 blocks: state.blocks,
43 metadata: HashMap::new(),
44 title: state.title.or(state.first_h1),
45 })
46 }
47}
48
49struct ExtractionState {
50 blocks: Vec<ExtractedBlock>,
51 heading_stack: Vec<String>,
52 next_offset: usize,
53 title: Option<String>,
54 first_h1: Option<String>,
55}
56
57impl ExtractionState {
58 fn new(title: Option<String>) -> Self {
59 Self {
60 blocks: Vec::new(),
61 heading_stack: Vec::new(),
62 next_offset: 0,
63 title,
64 first_h1: None,
65 }
66 }
67
68 fn walk_element(&mut self, element: ElementRef<'_>) -> bool {
69 let name = element_name(element);
70 if should_skip_element(element) {
71 return false;
72 }
73
74 if let Some(kind) = block_kind_for(name) {
75 let text = match kind {
76 BlockKind::CodeFence => collect_preserved_text(element),
77 _ => collect_normal_text(element),
78 };
79 self.push_block(kind, name, text);
80 return true;
81 }
82
83 let mut emitted_child = false;
84 let mut residual = TextCollector::normal();
85 for child in element.children() {
86 match child.value() {
87 Node::Text(text) => residual.push(text),
88 Node::Element(_) => {
89 let Some(child_element) = ElementRef::wrap(child) else {
90 continue;
91 };
92 let child_name = element_name(child_element);
93 if should_skip_element(child_element) {
94 continue;
95 }
96 if is_text_boundary_element(child_name) {
97 residual.push_boundary();
98 continue;
99 }
100
101 if block_kind_for(child_name).is_some() || is_structural_container(child_name) {
102 emitted_child |= self.push_residual_paragraph(&mut residual);
103 emitted_child |= self.walk_element(child_element);
104 } else {
105 collect_text_from_element(child_element, &mut residual);
106 }
107 }
108 _ => {}
109 }
110 }
111
112 emitted_child |= self.push_residual_paragraph(&mut residual);
113 emitted_child
114 }
115
116 fn push_residual_paragraph(&mut self, residual: &mut TextCollector) -> bool {
117 let text = residual.take();
118 if text.is_empty() {
119 return false;
120 }
121
122 self.push_block(BlockKind::Paragraph, "p", text);
123 true
124 }
125
126 fn push_block(&mut self, kind: BlockKind, element_name: &str, text: String) {
127 if text.is_empty() {
128 return;
129 }
130
131 let heading_path = self.heading_stack.clone();
132 let offset = self.next_offset;
133 let length = text.len();
134 self.next_offset = self.next_offset.saturating_add(length).saturating_add(2);
135
136 if kind == BlockKind::Heading {
137 let heading = text.clone();
138 if let Some(level) = heading_level(element_name) {
139 if level == 1 && self.first_h1.is_none() {
140 self.first_h1 = Some(heading.clone());
141 }
142 apply_heading(&mut self.heading_stack, level, heading);
143 }
144 }
145
146 self.blocks.push(ExtractedBlock {
147 text,
148 offset,
149 length,
150 kind,
151 heading_path,
152 attrs: HashMap::new(),
153 });
154 }
155}
156
157fn document_title(document: &Html) -> Option<String> {
158 let selector = Selector::parse("title").expect("valid title selector");
159 document
160 .select(&selector)
161 .next()
162 .map(collect_normal_text)
163 .filter(|title| !title.is_empty())
164}
165
166fn element_name(element: ElementRef<'_>) -> &str {
167 element.value().name()
168}
169
170fn should_skip_element(element: ElementRef<'_>) -> bool {
171 if should_skip_element_name(element_name(element)) {
172 return true;
173 }
174
175 element.value().attr("hidden").is_some()
176 || element
177 .value()
178 .attr("aria-hidden")
179 .is_some_and(|value| value.trim().eq_ignore_ascii_case("true"))
180 || element
181 .value()
182 .attr("style")
183 .is_some_and(style_declares_hidden)
184}
185
/// Reports whether elements with tag `name` are never extracted.
///
/// The comparison is exact (case-sensitive).
fn should_skip_element_name(name: &str) -> bool {
    const SKIPPED_TAGS: &[&str] = &[
        "head", "script", "style", "template", "noscript", "svg", "canvas", "math",
    ];
    SKIPPED_TAGS.contains(&name)
}
192
193fn block_kind_for(name: &str) -> Option<BlockKind> {
194 if heading_level(name).is_some() {
195 return Some(BlockKind::Heading);
196 }
197
198 match name {
199 "p" => Some(BlockKind::Paragraph),
200 "li" => Some(BlockKind::ListItem),
201 "dt" | "dd" => Some(BlockKind::Paragraph),
202 "blockquote" => Some(BlockKind::BlockQuote),
203 "pre" => Some(BlockKind::CodeFence),
204 _ => None,
205 }
206}
207
/// Reports whether tag `name` (`br` or `hr`) only separates surrounding text.
fn is_text_boundary_element(name: &str) -> bool {
    name == "br" || name == "hr"
}
211
/// Reports whether tag `name` is a container that is walked for nested blocks
/// rather than being treated as inline text.
///
/// The comparison is exact (case-sensitive).
fn is_structural_container(name: &str) -> bool {
    const CONTAINER_TAGS: &[&str] = &[
        // Document and sectioning elements.
        "html",
        "body",
        "main",
        "article",
        "section",
        "div",
        "header",
        "footer",
        "nav",
        "aside",
        // List wrappers.
        "ul",
        "ol",
        "menu",
        "dl",
        // Table structure.
        "table",
        "thead",
        "tbody",
        "tfoot",
        "tr",
        "td",
        "th",
        "caption",
        // Figures.
        "figure",
        "figcaption",
    ];
    CONTAINER_TAGS.contains(&name)
}
241
/// Reports whether an inline `style` attribute value hides its element.
///
/// Only `display` and `visibility` declarations are inspected. For each
/// property the winning declaration is the last one, except that a
/// `!important` declaration is not overridden by a later non-important one.
/// The element is hidden when the winning `display` is `none` or the winning
/// `visibility` is `hidden` or `collapse` (ASCII case-insensitive).
fn style_declares_hidden(style: &str) -> bool {
    let mut display_state: Option<StyleDeclarationState> = None;
    let mut visibility_state: Option<StyleDeclarationState> = None;

    // Declarations without a `:` are ignored.
    let declarations = style
        .split(';')
        .filter_map(|declaration| declaration.split_once(':'));
    for (raw_name, raw_value) in declarations {
        // The first `!`-separated segment is the value; any later segment
        // spelling `important` marks the declaration as important.
        let mut segments = raw_value.split('!');
        let value = segments.next().unwrap_or(raw_value).trim();
        let important = segments.any(|flag| flag.trim().eq_ignore_ascii_case("important"));

        let property = raw_name.trim();
        if property.eq_ignore_ascii_case("display") {
            let hidden = value.eq_ignore_ascii_case("none");
            apply_style_state(
                &mut display_state,
                StyleDeclarationState { important, hidden },
            );
        } else if property.eq_ignore_ascii_case("visibility") {
            let hidden = ["hidden", "collapse"]
                .iter()
                .any(|keyword| value.eq_ignore_ascii_case(keyword));
            apply_style_state(
                &mut visibility_state,
                StyleDeclarationState { important, hidden },
            );
        }
    }

    [display_state, visibility_state]
        .into_iter()
        .flatten()
        .any(|state| state.hidden)
}

/// Winning declaration seen so far for a single CSS property.
#[derive(Clone, Copy)]
struct StyleDeclarationState {
    /// Whether the declaration carried `!important`.
    important: bool,
    /// Whether the declared value hides the element.
    hidden: bool,
}

/// Replaces `current` with `next` unless `current` is important and `next` is not.
fn apply_style_state(current: &mut Option<StyleDeclarationState>, next: StyleDeclarationState) {
    let should_replace = match *current {
        None => true,
        Some(existing) => next.important || !existing.important,
    };
    if should_replace {
        *current = Some(next);
    }
}
291
/// Parses a heading tag name (`h1`–`h6`) into its numeric level.
///
/// Returns `None` for every other name, including uppercase variants.
fn heading_level(name: &str) -> Option<usize> {
    match name.as_bytes() {
        [b'h', digit @ b'1'..=b'6'] => Some(usize::from(*digit - b'0')),
        _ => None,
    }
}
299
/// Makes `heading` the innermost entry of `stack` for the given `level`.
///
/// Entries at depth `level` or deeper are discarded before `heading` is
/// pushed, so the stack never grows beyond `level` entries. The stack stores
/// only heading texts, not levels: a heading that skips levels (for example
/// an `h3` with no preceding `h1`/`h2`) occupies the next free slot.
///
/// A `level` of 0 is treated like level 1 (the stack is cleared first) rather
/// than looping forever on an always-true `len() >= 0` comparison.
fn apply_heading(stack: &mut Vec<String>, level: usize, heading: String) {
    stack.truncate(level.saturating_sub(1));
    stack.push(heading);
}
306
307fn collect_normal_text(element: ElementRef<'_>) -> String {
308 let mut collector = TextCollector::normal();
309 collect_text_from_element(element, &mut collector);
310 collector.finish()
311}
312
313fn collect_preserved_text(element: ElementRef<'_>) -> String {
314 let mut collector = TextCollector::preserve();
315 collect_text_from_element(element, &mut collector);
316 trim_preserved_text(collector.finish().as_str())
317}
318
319fn collect_text_from_element(element: ElementRef<'_>, collector: &mut TextCollector) {
320 if should_skip_element(element) {
321 return;
322 }
323
324 for child in element.children() {
325 match child.value() {
326 Node::Text(text) => collector.push(text),
327 Node::Element(_) => {
328 if let Some(child_element) = ElementRef::wrap(child) {
329 let child_name = element_name(child_element);
330 if should_skip_element(child_element) {
331 continue;
332 }
333
334 if is_text_boundary_element(child_name) {
335 collector.push_boundary();
336 } else if block_kind_for(child_name).is_some()
337 || is_structural_container(child_name)
338 {
339 collector.push_boundary();
340 collect_text_from_element(child_element, collector);
341 collector.push_boundary();
342 } else {
343 collect_text_from_element(child_element, collector);
344 }
345 }
346 }
347 _ => {}
348 }
349 }
350}
351
/// Whitespace policy applied by a [`TextCollector`].
enum TextMode {
    /// Collapse every whitespace run into a single space.
    Normal,
    /// Keep pushed text exactly as given.
    Preserve,
}

/// Incrementally builds block text under a given [`TextMode`].
struct TextCollector {
    /// Text accumulated so far.
    text: String,
    /// Whitespace policy in effect.
    mode: TextMode,
    /// Whether the last character appended in normal mode was a collapsed space.
    last_was_space: bool,
}

impl TextCollector {
    /// Creates an empty collector using `mode`.
    fn with_mode(mode: TextMode) -> Self {
        Self {
            text: String::new(),
            mode,
            last_was_space: false,
        }
    }

    /// Creates a collector that collapses whitespace.
    fn normal() -> Self {
        Self::with_mode(TextMode::Normal)
    }

    /// Creates a collector that keeps text verbatim.
    fn preserve() -> Self {
        Self::with_mode(TextMode::Preserve)
    }

    /// Appends `raw` according to the collector's mode.
    fn push(&mut self, raw: &str) {
        if matches!(self.mode, TextMode::Preserve) {
            self.text.push_str(raw);
        } else {
            self.push_normal(raw);
        }
    }

    /// Inserts a separator: a collapsed space in normal mode, or a newline in
    /// preserve mode unless the text already ends with one.
    fn push_boundary(&mut self) {
        match self.mode {
            TextMode::Normal => self.push_collapsed_space(),
            TextMode::Preserve => {
                if !self.text.ends_with('\n') {
                    self.text.push('\n');
                }
                self.last_was_space = false;
            }
        }
    }

    /// Appends `raw`, turning each whitespace run into at most one space.
    fn push_normal(&mut self, raw: &str) {
        for ch in raw.chars() {
            if ch.is_whitespace() {
                self.push_collapsed_space();
            } else {
                self.text.push(ch);
                self.last_was_space = false;
            }
        }
    }

    /// Appends one space unless the text is empty or already ends a space run.
    fn push_collapsed_space(&mut self) {
        if self.text.is_empty() || self.last_was_space {
            return;
        }
        self.text.push(' ');
        self.last_was_space = true;
    }

    /// Consumes the collector and returns its text (trimmed in normal mode).
    fn finish(mut self) -> String {
        self.take()
    }

    /// Returns the accumulated text (trimmed in normal mode) and resets the
    /// collector so it can be reused.
    fn take(&mut self) -> String {
        let collected = std::mem::take(&mut self.text);
        self.last_was_space = false;
        if matches!(self.mode, TextMode::Normal) {
            collected.trim().to_string()
        } else {
            collected
        }
    }
}
434
/// Normalizes line endings to `\n` and removes leading and trailing lines
/// that contain only whitespace.
///
/// Interior blank lines and per-line indentation are kept untouched.
fn trim_preserved_text(text: &str) -> String {
    let unified = text.replace("\r\n", "\n").replace('\r', "\n");

    // Drop the blank prefix, then cut everything after the last non-blank line.
    let mut kept: Vec<&str> = unified
        .lines()
        .skip_while(|line| line.trim().is_empty())
        .collect();
    let keep_len = kept
        .iter()
        .rposition(|line| !line.trim().is_empty())
        .map_or(0, |index| index + 1);
    kept.truncate(keep_len);

    kept.join("\n")
}
450
// Unit tests for `HtmlExtractor`: each test feeds an inline HTML fixture
// through `extract` and checks the emitted blocks or title.
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::{BlockKind, Extractor};
    use crate::ingest::html::HtmlExtractor;

    // Headings, paragraphs, list items, block quotes and code fences are all
    // emitted; `<title>` wins as the document title; script content is dropped.
    #[test]
    fn extracts_structural_html_blocks() {
        let extractor = HtmlExtractor;
        assert_eq!(extractor.profile_key(), "html");

        let source = br#"<!doctype html>
<html>
 <head>
 <title>Guide Title</title>
 <style>.hidden { display: none; }</style>
 <script>ignored_script()</script>
 </head>
 <body>
 <h1>Guide</h1>
 <p>Alpha <strong>HTML</strong> & canonical text.</p>
 <ul><li>First item</li></ul>
 <blockquote>Quoted text</blockquote>
 <pre><code>
fn main() {}
 </code></pre>
 </body>
</html>"#;

        let doc = extractor
            .extract(Path::new("docs/guide.html"), source)
            .expect("extract html");

        assert_eq!(doc.title.as_deref(), Some("Guide Title"));
        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == BlockKind::Heading && block.text == "Guide"));
        assert!(doc.blocks.iter().any(|block| {
            block.kind == BlockKind::Paragraph
                && block.text == "Alpha HTML & canonical text."
                && block.heading_path == vec!["Guide".to_string()]
        }));
        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == BlockKind::ListItem && block.text == "First item"));
        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == BlockKind::BlockQuote && block.text == "Quoted text"));
        assert!(doc.blocks.iter().any(|block| {
            block.kind == BlockKind::CodeFence && block.text.contains("fn main() {}")
        }));
        assert!(!doc
            .blocks
            .iter()
            .any(|block| block.text.contains("ignored_script")));
    }

    // Without a `<title>`, the first `<h1>` becomes the title even when it
    // appears after other headings.
    #[test]
    fn uses_first_h1_as_title_when_title_is_missing() {
        let extractor = HtmlExtractor;
        let doc = extractor
            .extract(
                Path::new("docs/guide.html"),
                b"<main><h2>Section</h2><p>before</p><h1>Guide</h1><p>body</p></main>",
            )
            .expect("extract html");

        assert_eq!(doc.title.as_deref(), Some("Guide"));
    }

    // Loose text, inline elements and table cells outside recognized blocks
    // each surface as their own block, and block text is not duplicated.
    #[test]
    fn preserves_visible_text_from_mixed_unrecognized_containers() {
        let extractor = HtmlExtractor;
        let doc = extractor
            .extract(
                Path::new("docs/prices.html"),
                br#"<body>
Lead text.
<p>Intro paragraph.</p>
<table><tr><td>Price tabletarget</td></tr></table>
<span>Tail text.</span>
</body>"#,
            )
            .expect("extract html");

        let texts = doc
            .blocks
            .iter()
            .map(|block| block.text.as_str())
            .collect::<Vec<_>>();
        assert!(texts.iter().any(|text| *text == "Lead text."));
        assert!(texts.iter().any(|text| *text == "Intro paragraph."));
        assert!(texts.iter().any(|text| *text == "Price tabletarget"));
        assert!(texts.iter().any(|text| *text == "Tail text."));
        assert_eq!(
            texts
                .iter()
                .filter(|text| text.contains("Intro paragraph."))
                .count(),
            1
        );
    }

    // `<br>` and `<hr>` separate words with a space; `<dt>`/`<dd>` become
    // separate blocks.
    #[test]
    fn preserves_boundaries_for_html_separator_elements() {
        let extractor = HtmlExtractor;
        let doc = extractor
            .extract(
                Path::new("docs/separators.html"),
                br#"<body>
<p>alpha<br>beta brtarget</p>
<dl><dt>Term</dt><dd>Definition ddtarget</dd></dl>
<div><span>left</span><hr><span>right hrtarget</span></div>
</body>"#,
            )
            .expect("extract html");

        // Blocks are joined with a blank line to form the canonical text.
        let canonical = doc
            .blocks
            .iter()
            .map(|block| block.text.as_str())
            .collect::<Vec<_>>()
            .join("\n\n");
        assert!(canonical.contains("alpha beta brtarget"));
        assert!(canonical.contains("Term\n\nDefinition ddtarget"));
        assert!(canonical.contains("left right hrtarget"));
        assert!(!canonical.contains("alphabeta"));
        assert!(!canonical.contains("TermDefinition"));
        assert!(!canonical.contains("leftright"));
    }

    // Block children nested inside a block element are flattened into the
    // parent block, separated by a space rather than fused together.
    #[test]
    fn preserves_boundaries_for_nested_block_children() {
        let extractor = HtmlExtractor;
        let doc = extractor
            .extract(
                Path::new("docs/nested.html"),
                br#"<body>
<blockquote><p>Alpha quote</p><p>Beta quotetarget</p></blockquote>
<ul><li><p>Parent item</p><p>Child paragraph listtarget</p></li></ul>
</body>"#,
            )
            .expect("extract html");

        let canonical = doc
            .blocks
            .iter()
            .map(|block| block.text.as_str())
            .collect::<Vec<_>>()
            .join("\n\n");
        assert!(canonical.contains("Alpha quote Beta quotetarget"));
        assert!(canonical.contains("Parent item Child paragraph listtarget"));
        assert!(!canonical.contains("quoteBeta"));
        assert!(!canonical.contains("itemChild"));
    }

    // Elements hidden via `hidden`, `aria-hidden="true"` or inline
    // `display`/`visibility` styles are dropped; `!important` and
    // last-declaration-wins ordering are respected.
    #[test]
    fn skips_hidden_html_elements() {
        let extractor = HtmlExtractor;
        let doc = extractor
            .extract(
                Path::new("docs/hidden.html"),
                br#"<body>
<p>Visible target</p>
<div hidden>secret hiddenword</div>
<section aria-hidden=" true "><p>aria hiddenword</p></section>
<div style="display: none">style hiddenword</div>
<div style="visibility:hidden !important">visibility hiddenword</div>
<div style="display:none !important; display:block">important hiddenword</div>
<div style="display:none; display:block">Actually visible visibletarget</div>
<div style="visibility:hidden; visibility:visible !important">Visible important importanttarget</div>
</body>"#,
            )
            .expect("extract html");

        let canonical = doc
            .blocks
            .iter()
            .map(|block| block.text.as_str())
            .collect::<Vec<_>>()
            .join("\n\n");
        assert!(canonical.contains("Visible target"));
        assert!(canonical.contains("Actually visible visibletarget"));
        assert!(canonical.contains("Visible important importanttarget"));
        assert!(!canonical.contains("hiddenword"));
    }

    // Input that is not valid UTF-8 is rejected with a descriptive error.
    #[test]
    fn rejects_non_utf8_html_bytes() {
        let extractor = HtmlExtractor;
        let err = extractor
            .extract(Path::new("docs/page.html"), &[0xff, 0xfe, 0xfd])
            .expect_err("invalid utf8 should fail");
        assert!(err.to_string().contains("non-utf8 html input"));
    }
}