Skip to main content

orbok_extract/
html.rs

//! HTML text extractor.
//!
//! Strips HTML tags with a simple state-machine parser and preserves
//! visible text content. Block-level elements (such as `p`, `div`,
//! `h1`–`h6`, `li`, `td`, `th`, `br`) produce paragraph boundaries.
//! `<h1>`–`<h6>` headings populate `heading_path`.
//!
//! Security: no JavaScript execution, no external resource loading,
//! no DOM construction. Pure text extraction only (RFC-015 §15).
10
11use crate::normalize::normalize_document;
12use crate::types::{
13    DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
14};
15use orbok_core::{OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17
/// Extractor identifier: returned by `name()` and copied into
/// `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "html";
/// Extractor revision: returned by `version()` and copied into
/// `ExtractOutput::extractor_version`.
const EXTRACTOR_VERSION: &str = "v1";
20
/// Stateless extractor that turns `.html` / `.htm` files into text segments.
///
/// The type carries no data, so it is cheap to copy and can be created with
/// `HtmlExtractor` or `HtmlExtractor::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct HtmlExtractor;
22
23impl DocumentExtractor for HtmlExtractor {
24    fn name(&self) -> &'static str { EXTRACTOR_NAME }
25    fn version(&self) -> &'static str { EXTRACTOR_VERSION }
26    fn supported_extensions(&self) -> &'static [&'static str] { &["html", "htm"] }
27
28    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
29        let content = std::fs::read_to_string(&path.canonical)?;
30        let blocks = extract_blocks(&content);
31        let mut segments = Vec::new();
32        let mut total_chars = 0u64;
33        let mut line = 1u32;
34
35        for block in &blocks {
36            let norm = normalize_document(&block.text);
37            if norm.trim().is_empty() { line += 1; continue; }
38            total_chars += norm.len() as u64;
39            segments.push(ExtractedSegment {
40                kind: block.kind,
41                text: norm,
42                line_start: line,
43                line_end: line,
44                heading_path: block.heading.clone(),
45                location_quality: LocationQuality::Approximate,
46            });
47            line += 1;
48        }
49
50        Ok(ExtractOutput {
51            extractor_name: EXTRACTOR_NAME.to_string(),
52            extractor_version: EXTRACTOR_VERSION.to_string(),
53            normalization_version: NORMALIZATION_VERSION.to_string(),
54            segments,
55            char_count: total_chars,
56        })
57    }
58}
59
/// One run of visible text produced by `extract_blocks`.
struct Block {
    /// Text content of the block, trimmed of surrounding whitespace.
    text: String,
    /// `SegmentKind::Heading` for the text of an `<h1>`–`<h6>` element,
    /// `SegmentKind::Paragraph` for everything else.
    kind: SegmentKind,
    /// Heading trail (levels joined with `" > "`) in effect when the block
    /// was emitted; `None` until the first non-empty heading has been seen.
    heading: Option<String>,
}
65
/// Elements that end the current paragraph (on both their open and close tag).
///
/// Besides the text-level blocks this also lists sectioning, list and table
/// containers (`nav`, `header`, `footer`, `main`, `ul`, `ol`, `table`, ...):
/// without them, text placed directly inside adjacent containers would be
/// glued together into a single paragraph with no separator.
const BLOCK_TAGS: &[&str] = &[
    // Text blocks and headings.
    "p", "div", "section", "article", "aside",
    "h1", "h2", "h3", "h4", "h5", "h6", "li", "dt", "dd",
    "td", "th", "caption", "blockquote", "pre", "br",
    // Sectioning, grouping, list and table containers.
    "address", "details", "dialog", "dl", "fieldset", "figcaption", "figure",
    "footer", "form", "header", "hgroup", "hr", "main", "nav", "ol",
    "summary", "table", "tr", "ul",
];
/// Heading elements; the digit after `h` is the heading level.
const HEADING_TAGS: &[&str] = &["h1", "h2", "h3", "h4", "h5", "h6"];
/// Tags whose content should be suppressed entirely.
const SKIP_TAGS: &[&str] = &["script", "style", "head", "noscript", "template"];
73
74fn extract_blocks(html: &str) -> Vec<Block> {
75    let mut blocks = Vec::new();
76    let mut current_text = String::new();
77    let mut current_heading: Option<String> = None;
78    let mut heading_trail: Vec<String> = Vec::new();
79    let mut skip_depth: usize = 0;
80    let mut skip_tag = "";
81    let mut pos = 0;
82    let chars: Vec<char> = html.chars().collect();
83    let n = chars.len();
84
85    let flush = |text: &mut String, heading: &Option<String>, blocks: &mut Vec<Block>| {
86        let t = text.trim().to_string();
87        if !t.is_empty() {
88            blocks.push(Block { text: t, kind: SegmentKind::Paragraph, heading: heading.clone() });
89        }
90        text.clear();
91    };
92
93    while pos < n {
94        if chars[pos] == '<' {
95            // Collect tag
96            let mut tag_end = pos + 1;
97            while tag_end < n && chars[tag_end] != '>' { tag_end += 1; }
98            let tag_str: String = chars[pos..tag_end.min(n)].iter().collect();
99            let is_close = tag_str.starts_with("</");
100            let tag_name = tag_str
101                .trim_start_matches('<')
102                .trim_start_matches('/')
103                .split_whitespace()
104                .next()
105                .unwrap_or("")
106                .to_ascii_lowercase();
107
108            if skip_depth > 0 {
109                if is_close && tag_name == skip_tag { skip_depth -= 1; }
110                else if !is_close && tag_name == skip_tag { skip_depth += 1; }
111            } else if !is_close && SKIP_TAGS.contains(&tag_name.as_str()) {
112                skip_tag = SKIP_TAGS.iter().find(|&&t| t == tag_name.as_str()).copied().unwrap_or("");
113                skip_depth = 1;
114                flush(&mut current_text, &current_heading, &mut blocks);
115            } else if is_close && HEADING_TAGS.contains(&tag_name.as_str()) {
116                // Closing a heading: record it and update heading_path context.
117                let heading_text = current_text.trim().to_string();
118                if !heading_text.is_empty() {
119                    let level = tag_name[1..].parse::<usize>().unwrap_or(1);
120                    heading_trail.truncate(level.saturating_sub(1));
121                    heading_trail.push(heading_text.clone());
122                    current_heading = Some(heading_trail.join(" > "));
123                    blocks.push(Block {
124                        text: heading_text,
125                        kind: SegmentKind::Heading,
126                        heading: current_heading.clone(),
127                    });
128                    current_text.clear();
129                }
130            } else if BLOCK_TAGS.contains(&tag_name.as_str()) {
131                flush(&mut current_text, &current_heading, &mut blocks);
132                if !is_close && HEADING_TAGS.contains(&tag_name.as_str()) {
133                    let level = tag_name[1..].parse::<usize>().unwrap_or(1);
134                    heading_trail.truncate(level.saturating_sub(1));
135                }
136            }
137            pos = tag_end + 1;
138        } else if skip_depth == 0 {
139            // Decode common HTML entities inline.
140            let c = chars[pos];
141            if c == '&' {
142                let semi = chars[pos..].iter().position(|&x| x == ';').map(|p| pos + p);
143                if let Some(end) = semi {
144                    let entity: String = chars[pos..=end].iter().collect();
145                    match entity.as_str() {
146                        "&amp;" => current_text.push('&'),
147                        "&lt;" => current_text.push('<'),
148                        "&gt;" => current_text.push('>'),
149                        "&quot;" | "&ldquo;" | "&rdquo;" => current_text.push('"'),
150                        "&apos;" | "&lsquo;" | "&rsquo;" => current_text.push('\''),
151                        "&nbsp;" | "&#160;" => current_text.push(' '),
152                        _ => current_text.push_str(&entity),
153                    }
154                    pos = end + 1;
155                    continue;
156                }
157            }
158            current_text.push(c);
159            pos += 1;
160        } else {
161            pos += 1;
162        }
163    }
164    flush(&mut current_text, &current_heading, &mut blocks);
165    blocks
166}